In [36]:
import os
import ocrmypdf
from pathlib import Path
import pymupdf
import torch
import sys

sys.path.append('../src')

from utils import *
from config import *

In [2]:
# Prefer CUDA when torch reports it as available; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Report the selected device in the cell output.
print(f'Running code @ {device}')

Running code @ cpu


#### Apply OCR on scanned PDFs

In [None]:
# TODO: define better criteria for deciding when to apply OCR; currently only PDFs with no extractable text on any page (i.e. scanned PDFs) are processed

In [27]:
def pdf_needs_ocr(pdf_path: str) -> bool:
    """
    Report whether a PDF has no extractable text on any of its pages.

    The document is opened with pymupdf, each page's text is inspected, and
    the document is closed again before returning.

    Args:
        pdf_path: Path of the PDF file to inspect.

    Returns:
        True when every page yields only whitespace (or the document has no
        pages at all); False as soon as one page contains text.
    """
    # Use the document as a context manager so its file handle is released;
    # without it every inspected PDF stays open.
    with pymupdf.open(pdf_path) as doc:
        # A generator (rather than a list) lets any() stop at the first page
        # that contains text instead of extracting text from every page.
        return not any(page.get_text().strip() for page in doc)

def apply_ocr(pdf_path: str) -> str | None:
    """
    Check if document needs OCR, use ocrmypdf.
    Return path of output pdf.
    """

    if not pdf_needs_ocr(pdf_path):
        return None

    filename = pdf_path.split("/")[-1]
    dir = OCR_DIR / pdf_path.split("/")[-2]
    output_pdf_path = dir / filename

    os.makedirs(dir, exist_ok=True)

    if not os.path.exists(output_pdf_path):
        ocrmypdf.ocr(
            input_file=pdf_path,
            output_file=output_pdf_path.as_posix(),
            output_type="pdf",
            language="ron",
            deskew=True,
            rotate_pages=True,
            progress_bar=True,
            jobs=4,
        )

    return output_pdf_path.relative_to(BASE_DIR).as_posix()

In [37]:
def preprocess_pdfs(path=DATA_DIR) -> None:
    """
    Apply OCR to the PDFs of every sub-directory of `path` and record the
    resulting OCR file locations in each directory's metadata.json.

    For each directory returned by get_directories(path), except the one
    named like OCR_DIR, every ".pdf" file is passed to apply_ocr. When
    apply_ocr returns a path, it is stored under the "ocr_path" key of that
    file's metadata entry. The metadata is written back once per directory.

    Args:
        path: Root directory whose sub-directories are processed
            (defaults to DATA_DIR).
    """
    root = Path(path)

    # Skip the OCR output directory. Filtering (instead of list.remove) does
    # not raise ValueError when that directory is not listed, e.g. before the
    # first OCR run has created it.
    # NOTE(review): assumes get_directories returns directory names — confirm.
    ocr_dir_name = OCR_DIR.name
    directories = [name for name in get_directories(path) if name != ocr_dir_name]

    for directory in directories:
        # Use `path` (not DATA_DIR) so a non-default root reads PDFs and
        # metadata from the same tree.
        pdfs = get_files_by_extension(path=os.path.join(path, directory), extension=".pdf")

        metadata_filepath = root / directory / "metadata.json"
        # NOTE(review): presumably a dict keyed by PDF file name — confirm.
        metadata = read_metadata(metadata_filepath)

        for filename in pdfs:
            pdf_path = root / directory / filename
            ocr_path = apply_ocr(pdf_path.as_posix())

            if ocr_path:
                # setdefault avoids a KeyError (and losing this directory's
                # metadata write) for a PDF that has no metadata entry yet.
                metadata.setdefault(filename, {})["ocr_path"] = ocr_path

        write_to_json_file(metadata_filepath.as_posix(), metadata)

# Run the OCR preprocessing with the default root directory (DATA_DIR).
preprocess_pdfs()