## Alur Konversi PDF ke Markdown/JSON
#### 1. Inisialisasi & Setup
- **Import library dan setup direktori**  
    Dilakukan langsung di script/notebook, bukan di dalam fungsi khusus.

#### 2. Loop untuk Setiap File PDF
- **Fungsi utama:** `process_pdf(pdf_file, ...)`
- **Inisialisasi model YOLO:**  
    `model = YOLO(get_latest_yolo_model_path())`

#### 3. Cek Apakah File Sudah Pernah Diproses
- **Fungsi:** `check_json_file_exists(json_result_path)`

#### 4. Buka PDF dan Siapkan Temp Folder Gambar
- **Library dan fungsi:** `pymupdf.open(pdf_path)`  
    Langsung di dalam: `process_pdf`

#### 5. Loop untuk Setiap Halaman PDF
- **Di dalam:** `process_pdf`  
    Untuk setiap halaman:

##### a. Render Halaman ke Gambar
- **Langsung di dalam:** `process_pdf`  
    (`page.get_pixmap()`, `page_image.save()`)

##### b. Deteksi Objek dengan YOLO (Opsional)
- **Fungsi:**  
    - `model.predict(image_path, ...)`
    - `yolo_to_pdf_rectangles(boxes, zoom)`
    - `draw_bounding_boxes(page, rectangles)`

##### c. Simpan Halaman ke PDF Sementara
- **Fungsi:**  
    `pymupdf.open()`, `temp_pdf.insert_pdf()`, `temp_pdf.save()`

##### d. Ekstrak Teks dengan Docling
- **Fungsi:** `extract_text_from_pdf_page(page_pdf_path, ...)`

##### e. Update JSON Hasil Sementara
- **Langsung di dalam:** `process_pdf`  
    (`with open(json_result_path, ...) as json_file: ...`)

##### f. Logging Proses
- **Fungsi:** `logging_process(status, message)`

#### 6. Simpan JSON Hasil Akhir
- **Langsung di dalam:** `process_pdf`  
    (`with open(json_result_path, ...) as json_file: ...`)

#### 7. Cleanup
- **Library:**  
    - `shutil.rmtree(temp_image_dir, ignore_errors=True)`
    - `for f in result_dir.glob("*.pdf"): f.unlink()`

#### 8. Kembalikan Status Proses
- **Fungsi:** `yield logging_process(...)` di dalam `process_pdf`


In [1]:
from pathlib import Path
from typing import Any
import os
import shutil
import gc
import json
import time
import math
from glob import glob

import pymupdf
from pymupdf import Page
from ultralytics import YOLO

In [2]:
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
    AcceleratorDevice,
    AcceleratorOptions,
    PdfPipelineOptions,
    EasyOcrOptions,
)

from docling.datamodel.settings import settings # untuk debugging waktu konversi
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.utils.model_downloader import download_models

## Setup direktori berdasarkan struktur folder `app/`

In [3]:
# Working directories, all relative to the `app/` folder.
OUTPUT_DIR = Path("app", "results")  # final JSON / Markdown results
PDF_PATH = Path("app", "temp", "pdf")  # source PDF files
TEMP_IMAGE_DIR = Path("app", "temp", "image")  # per-page rendered images
ARTIFACT_PATH = Path("app", "models")  # Docling model artifacts

# Every directory that has to exist before processing starts.
dir_list = [OUTPUT_DIR, PDF_PATH, TEMP_IMAGE_DIR, ARTIFACT_PATH]

### Debugging

In [4]:
# Create each required directory, reporting the ones that are already present.
for dir_path in dir_list:
    if dir_path.exists():
        print(f"Directory {dir_path} already exists, skipping creation.")
        continue
    dir_path.mkdir(parents=True, exist_ok=True)

Directory app\models already exists, skipping creation.


## Mencari model YOLO dan menggunakan yang terbaru

In [5]:
def get_latest_yolo_model_path(yolo_dir="app/yolo"):
    """
    Return the most recently modified ``*.pt`` file inside ``yolo_dir``.

    Args:
        yolo_dir (str): Directory that holds the YOLO weight files.
    Returns:
        Path: The ``.pt`` file with the newest modification time.
    Raises:
        FileNotFoundError: If the directory contains no ``*.pt`` file.
    """
    pattern = str(Path(yolo_dir) / "*.pt")
    candidates = glob(pattern)
    if not candidates:
        raise FileNotFoundError(f"YOLO model file not found at {yolo_dir}/")
    # "Latest" is decided by file modification time, not by file name.
    newest = max(candidates, key=os.path.getmtime)
    return Path(newest)


## Mengubah bounding box pada hasil deteksi YOLO ke dalam format yang sesuai dengan PyMuPDF

In [6]:
def yolo_to_pdf_rectangles(boxes, zoom):
    """
    Convert YOLO ``xyxy`` bounding boxes to PyMuPDF rectangles.

    Each coordinate is divided by ``zoom`` so that a box measured on an image
    rendered at ``zoom`` times the page size maps back to page coordinates.

    Args:
        boxes (list of list or tuple): Bounding boxes, each given as
            ``[x0, y0, x1, y1]``.
        zoom (float): Zoom factor the coordinates are scaled down by.

    Returns:
        list[pymupdf.Rect]: One rectangle per input box, in the same order.
    """
    # True division keeps the fractional part. Floor division would snap every
    # edge down to a whole point, shifting the rectangle up/left and letting
    # the right/bottom edge miss part of the detected area.
    return [
        pymupdf.Rect(
            box[0] / zoom,  # x0
            box[1] / zoom,  # y0
            box[2] / zoom,  # x1
            box[3] / zoom,  # y1
        )
        for box in boxes
    ]


## Menghapus komponen atau elemen yang tidak diperlukan dengan membuat box berwarna putih

In [7]:
def draw_bounding_boxes(page: Page, rectangles: list[pymupdf.Rect]):
    """
    Blank out the given areas of a PDF page with white redactions.

    A redaction annotation with a white fill is added for every rectangle,
    then all redactions are applied to the page in a single call.

    Args:
        page (Page): The PDF page to modify.
        rectangles (list[pymupdf.Rect]): Areas of the page to cover.

    Returns:
        Page: The same page object that was passed in, after the redactions
        have been applied.
    """
    white = (1, 1, 1)
    for area in rectangles:
        page.add_redact_annot(area, fill=white)
    # Applied once, after every annotation has been queued on the page.
    page.apply_redactions()
    return page


## Proses konversi PDF ke dalam Markdown menggunakan Docling

In [8]:
def extract_text_from_pdf_page(
    src_path, result_path, create_markdown, number_thread, force_full_page_ocr=False
):
    """Convert one PDF file to Markdown with Docling, using EasyOCR.

    Args:
        - src_path (str): Path to the source PDF file handed to the converter.
        - result_path (str): Output path without extension; ``.md`` is appended
          when the Markdown file is written.
        - create_markdown (bool): Whether to write the text to ``<result_path>.md``.
        - number_thread (int): Number of threads given to the accelerator options.
        - force_full_page_ocr (bool): Forwarded to the EasyOCR options.
          Default is False.

    Returns:
        A 3-tuple ``(text, doc_conversion_secs, confidence_data)``:
        - text (str | None): The exported Markdown. ``None`` when the export is
          blank and ``force_full_page_ocr`` is False; no Markdown file is
          written in that case, so the caller can retry with full-page OCR.
        - doc_conversion_secs (float): First recorded ``"pipeline_total"``
          timing, rounded to two decimals.
        - confidence_data (dict): ``conv_result.confidence.model_dump()``.
    """
    # NOTE(review): presumably needed so that `conv_result.timings` is filled
    # in for the duration read below - confirm against the Docling docs.
    settings.debug.profile_pipeline_timings = True

    # References:
    # - https://docling-project.github.io/docling/examples/custom_convert/
    # - https://docling-project.github.io/docling/examples/run_with_accelerator/

    # Inactive alternative OCR backend, kept for reference:
    # ocr_options = TesseractCliOcrOptions(
    #     lang=["eng", "id"],
    #     force_full_page_ocr=force_full_page_ocr,
    #     tesseract_cmd="tesseract",
    # )
    ocr_options = EasyOcrOptions(
        lang=["en", "id"],
        force_full_page_ocr=force_full_page_ocr,
    )

    pipeline_options = PdfPipelineOptions()
    pipeline_options.artifacts_path = ARTIFACT_PATH
    pipeline_options.accelerator_options = AcceleratorOptions(
        num_threads=number_thread, device=AcceleratorDevice.AUTO
    )
    pipeline_options.do_ocr = True
    pipeline_options.ocr_options = ocr_options
    pipeline_options.do_table_structure = True
    pipeline_options.table_structure_options.do_cell_matching = True
    pipeline_options.images_scale = 2.0
    pipeline_options.generate_picture_images = True

    pdf_format_option = PdfFormatOption(pipeline_options=pipeline_options)
    converter = DocumentConverter(
        allowed_formats=[InputFormat.PDF, InputFormat.IMAGE],
        format_options={InputFormat.PDF: pdf_format_option},
    )

    conv_result = converter.convert(src_path)
    elapsed = conv_result.timings["pipeline_total"].times[0]
    doc_conversion_secs = round(elapsed, 2)
    text = conv_result.document.export_to_markdown(escape_underscores=False)
    confidence_data = conv_result.confidence.model_dump()

    # A blank export may mean a scanned page: report None so the caller can
    # run the conversion again with force_full_page_ocr=True.
    is_blank = not text.strip()
    if is_blank and force_full_page_ocr is False:
        return None, doc_conversion_secs, confidence_data

    if create_markdown:
        with open(f"{result_path}.md", "w", encoding="utf-8") as md_file:
            md_file.write(text)

    return text, doc_conversion_secs, confidence_data


## Fungsi bantuan untuk logging proses ke halaman Streamlit dan pengecekan file JSON

In [9]:
def logging_process(status: str, message: str):
    """Bundle a process status and its message into one record.

    Args:
        status (str): The status of the process.
        message (str): The message to report.

    Returns:
        dict: ``{"status": status, "message": message}``.
    """
    record = dict(status=status, message=message)
    return record


In [10]:
def check_json_file_exists(file_path: Any | Path):
    """Checks if a JSON file exists and is have content.

    Args:
        file_path (str): The path to the JSON file.

    Returns:
        bool: True if the file exists and has content, False otherwise.
    """
    if file_path.exists():
        json_content = json.loads(file_path.read_text(encoding="utf-8"))
        total_pages = json_content.get("total_page", 0) or 0
        total_page_extracted = len(json_content.get("content", [])) or 0
        if total_pages == total_page_extracted:
            return True
    return False


## Pemrosesan Utama

In [11]:
def _write_result_json(json_path, payload):
    """Write ``payload`` to ``json_path`` as UTF-8 JSON, rejecting NaN/Infinity."""
    with open(json_path, "w", encoding="utf-8") as json_file:
        json.dump(
            payload,
            json_file,
            ensure_ascii=False,
            indent=2,
            allow_nan=False,
        )


def _find_excluded_rectangles(model, image_path, zoom):
    """Run YOLO on a rendered page image and return rectangles for class-0 detections."""
    results = model.predict(str(image_path), verbose=False, conf=0.5)
    detections = results[0].boxes
    class_ids = detections.cls.cpu().numpy()
    coords = detections.xyxy.cpu().numpy()
    # Only detections of class 0 are removed from the page.
    boxes = [box for cls_value, box in zip(class_ids, coords) if cls_value == 0]
    return yolo_to_pdf_rectangles(boxes, zoom) if boxes else []


def _save_single_page_pdf(pdf, page_number, page_pdf_path):
    """Copy one page of ``pdf`` into a new one-page PDF saved at ``page_pdf_path``."""
    with pymupdf.open() as temp_pdf:
        temp_pdf.insert_pdf(
            pdf,
            from_page=page_number,
            to_page=page_number,
            links=False,
            widgets=False,
        )
        temp_pdf.save(str(page_pdf_path), garbage=4, deflate=True)


def _remove_temp_artifacts(page_pdf_paths, temp_image_dir):
    """Delete the per-page PDFs created by this run and the temp image folder."""
    for page_pdf in page_pdf_paths:
        page_pdf.unlink(missing_ok=True)
    shutil.rmtree(temp_image_dir, ignore_errors=True)


def process_pdf(
    pdf_file: str,
    idx: int = 1,
    create_markdown=False,
    overwrite=True,
    exclude_object=True,
    number_thread: int = 4,
    output_dir: str | Path | None = None,
):
    """
    Process a PDF file page by page and store the extracted text as JSON.

    For every page: render it to an image, optionally blank out the areas
    YOLO detects as class 0, save the page as a one-page PDF, extract its text
    with ``extract_text_from_pdf_page`` (retrying with full-page OCR when the
    first pass returns no text), and rewrite the JSON result so partial
    progress is kept on disk.

    Args:
        pdf_file (str): Path to the PDF file to process.
        idx (int): Index of the PDF file in the processing queue. Only used in
            the error message.
        create_markdown (bool): Whether to create markdown files from the
            extracted text. When True, results go to ``output_dir/<pdf stem>/``;
            otherwise directly into ``output_dir``.
        overwrite (bool): Whether to overwrite existing JSON results. When
            False, a PDF whose JSON result is already complete is skipped.
        exclude_object (bool): Whether to exclude objects detected by YOLO.
            The YOLO model is only located and loaded when this is True.
        number_thread (int): Number of threads to use for OCR.
        output_dir (str | Path | None): Directory to save the output results.
            ``None`` falls back to ``OUTPUT_DIR``.
    Yields:
        dict: Status messages (see ``logging_process``) indicating the
        progress of the processing; the last one has status ``"success"`` or
        ``"error"``.
    Raises:
        FileNotFoundError: If ``exclude_object`` is True and no YOLO model
            file is found (raised by ``get_latest_yolo_model_path``).
    """
    base_name = Path(pdf_file).stem
    pdf_path = pdf_file
    total = 0  # Total pages in the PDF file; stays 0 if it cannot be opened.

    # Accept str as well as Path, as the signature promises.
    output_root = Path(output_dir) if output_dir is not None else OUTPUT_DIR
    result_dir = output_root / base_name if create_markdown else output_root
    result_dir.mkdir(parents=True, exist_ok=True)
    json_result_path = result_dir / f"{base_name}.json"

    # Decide whether to skip before loading any model.
    if not overwrite and check_json_file_exists(json_result_path):
        yield logging_process(
            "info", f"[SKIP] JSON result already exists for {base_name}, skipping."
        )
        return

    model = None
    if exclude_object:
        MODEL_YOLO = get_latest_yolo_model_path()
        print(f"Using YOLO model: {MODEL_YOLO}")
        model = YOLO(MODEL_YOLO)

    # Download the Docling models only when the artifact folder holds no file.
    if not any(f.is_file() for f in ARTIFACT_PATH.rglob("*")):
        print("Models not found, downloading...")
        download_models(output_dir=ARTIFACT_PATH, progress=True)

    temp_image_dir = TEMP_IMAGE_DIR / base_name
    # Only files created by this run are deleted afterwards. A "*.pdf" glob
    # over result_dir would also remove unrelated PDFs when result_dir is the
    # shared output directory.
    page_pdf_paths = []
    zoom = 3

    try:
        try:
            with pymupdf.open(pdf_path) as pdf:
                total = pdf.page_count
                result_json = {"content": [], "total_page": total}
                temp_image_dir.mkdir(parents=True, exist_ok=True)
                total_times = 0
                mat = pymupdf.Matrix(zoom, zoom)

                for page_index, page in enumerate(pdf.pages(), start=1):
                    page_image = page.get_pixmap(matrix=mat)
                    image_path = temp_image_dir / f"{base_name}-page-{page_index}.png"
                    page_image.save(str(image_path))
                    del page_image  # Release the rendered pixmap early.

                    if model is not None:
                        rectangles = _find_excluded_rectangles(model, image_path, zoom)
                        if rectangles:
                            page = draw_bounding_boxes(page, rectangles)

                    page_pdf_path = result_dir / f"{base_name}-page-{page_index}.pdf"
                    page_pdf_paths.append(page_pdf_path)
                    _save_single_page_pdf(pdf, page.number, page_pdf_path)
                    page_result_path = result_dir / f"{base_name}-page-{page_index}"

                    # First pass without forced OCR; None means no text came out.
                    markdown_text, time_spent, confidence_data = (
                        extract_text_from_pdf_page(
                            page_pdf_path,
                            page_result_path,
                            create_markdown,
                            number_thread,
                            force_full_page_ocr=False,
                        )
                    )

                    if markdown_text is None:
                        yield logging_process(
                            "info",
                            f"Page {page_index}/{pdf.page_count} of {base_name} is empty, running OCR again.",
                        )
                        # The page may be scanned, so run again with
                        # force_full_page_ocr=True.
                        markdown_text, time_spent, confidence_data = (
                            extract_text_from_pdf_page(
                                page_pdf_path,
                                page_result_path,
                                create_markdown,
                                number_thread,
                                force_full_page_ocr=True,
                            )
                        )

                    # NaN is not valid JSON; store missing scores as null.
                    page_confidence = {
                        k: (None if isinstance(v, float) and math.isnan(v) else v)
                        for k, v in confidence_data["pages"][0].items()
                    }

                    temp_content = {
                        "page": page_index,
                        "content": markdown_text,
                        "duration": time_spent,
                    }
                    temp_content.update(page_confidence)
                    result_json["content"].append(temp_content)
                    total_times += time_spent

                    # Persist progress before yielding, so a consumer that
                    # stops iterating still finds this page on disk.
                    _write_result_json(json_result_path, result_json)

                    yield logging_process(
                        "info",
                        f"Processed page {page_index}/{pdf.page_count} of {base_name} in {time.strftime('%H:%M:%S', time.gmtime(time_spent))}",
                    )

                    gc.collect()

                # Save the total time taken for processing the PDF
                result_json["total_time"] = round(total_times, 2)
                _write_result_json(json_result_path, result_json)
        finally:
            # Runs on success, on failure, and when the consumer closes the
            # generator early.
            _remove_temp_artifacts(page_pdf_paths, temp_image_dir)

        yield logging_process("success", f"Finished processing PDF: {base_name}")

    except Exception as e:
        # NOTE(review): this message pairs the file index (idx + 1) with the
        # page count (total) - confirm which pair was intended.
        yield logging_process("error", f"Failed to process PDF {idx + 1}/{total}: {e}")


In [14]:
# Run the pipeline on one sample PDF and print every status it yields.
for status in process_pdf(
    pdf_file="app/temp/pdf/kap_pjj_penilaian_aset_tak_berwujud_djp_bagi_pegawai_djp.pdf",
    output_dir=OUTPUT_DIR,
):
    print(status)

Using YOLO model: app\yolo\best-1.pt


  from .autonotebook import tqdm as notebook_tqdm


{'status': 'info', 'message': 'Processed page 1/8 of kap_pjj_penilaian_aset_tak_berwujud_djp_bagi_pegawai_djp in 00:00:08'}
{'status': 'info', 'message': 'Processed page 2/8 of kap_pjj_penilaian_aset_tak_berwujud_djp_bagi_pegawai_djp in 00:00:10'}
{'status': 'info', 'message': 'Processed page 3/8 of kap_pjj_penilaian_aset_tak_berwujud_djp_bagi_pegawai_djp in 00:00:14'}
{'status': 'info', 'message': 'Processed page 4/8 of kap_pjj_penilaian_aset_tak_berwujud_djp_bagi_pegawai_djp in 00:00:17'}
{'status': 'info', 'message': 'Processed page 5/8 of kap_pjj_penilaian_aset_tak_berwujud_djp_bagi_pegawai_djp in 00:00:09'}
{'status': 'info', 'message': 'Processed page 6/8 of kap_pjj_penilaian_aset_tak_berwujud_djp_bagi_pegawai_djp in 00:00:10'}
{'status': 'info', 'message': 'Processed page 7/8 of kap_pjj_penilaian_aset_tak_berwujud_djp_bagi_pegawai_djp in 00:00:15'}
{'status': 'info', 'message': 'Processed page 8/8 of kap_pjj_penilaian_aset_tak_berwujud_djp_bagi_pegawai_djp in 00:00:03'}
{'status