In [8]:
from paddleocr import PaddleOCR, draw_ocr
import fitz
import numpy as np
from PIL import Image
import os

# Initialize OCR with better defaults for performance
def initialise(language="en", use_gpu=False, cpu_threads=20):
    """Create the PaddleOCR engine and store it in the module-level ``ocr``.

    Must be called before ``process_image``, which reads the global ``ocr``.

    Args:
        language: Language code forwarded to PaddleOCR as ``lang``.
        use_gpu: Forwarded to PaddleOCR's ``use_gpu``. Defaults to ``False``
            (CPU inference), matching the previous hard-coded value.
        cpu_threads: Forwarded to PaddleOCR's ``cpu_threads``. Defaults to 20,
            the previous hard-coded value; lower it on machines with fewer
            cores.

    Returns:
        None. The engine is published through the global ``ocr``.
    """
    global ocr
    ocr = PaddleOCR(
        use_angle_cls=False,  # Angle classifier stays off; process_image also calls ocr with cls=False
        lang=language,
        use_gpu=use_gpu,
        enable_mkldnn=True,  # Enable MKL-DNN acceleration
        cpu_threads=cpu_threads,
    )

# Memoised font location; stays None until the first successful lookup
_FONT_PATH = None
def get_font_path():
    """Return the path of a font file, resolving it only once.

    The result is kept in the module-level ``_FONT_PATH``. On Windows
    (``os.name == 'nt'``) the Arial path is returned without checking that
    the file exists. On other systems the first existing entry of a fixed
    candidate list is used.

    Raises:
        FileNotFoundError: On non-Windows systems when none of the candidate
            font files exists.
    """
    global _FONT_PATH
    if _FONT_PATH is None:
        if os.name == 'nt':
            _FONT_PATH = r"C:/Windows/Fonts/arial.ttf"
        else:
            candidates = (
                "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf",
                "/usr/share/fonts/TTF/DejaVuSans.ttf",
                "/System/Library/Fonts/Helvetica.ttc",
            )
            # next() stops probing at the first candidate that exists
            _FONT_PATH = next(
                (candidate for candidate in candidates if os.path.exists(candidate)),
                None,
            )
            if _FONT_PATH is None:
                raise FileNotFoundError("No suitable font found")
    return _FONT_PATH

def process_image(image, count, output_dir="output", max_side=2000, jpeg_quality=85):
    """Run OCR on one page image and write the text plus an annotated JPEG.

    Requires ``initialise()`` to have been called so the global ``ocr`` exists.

    Args:
        image: Page pixels, passed to ``Image.fromarray`` and ``ocr.ocr``.
        count: Page number used in output file names and printed messages.
        output_dir: Directory receiving the outputs; created if missing.
        max_side: If the image is wider or taller than this many pixels it is
            downscaled (aspect ratio kept) before OCR. Defaults to 2000.
        jpeg_quality: JPEG quality for the annotated image. Defaults to 85.

    Returns:
        None. On success writes ``page_{count}_text.txt`` and
        ``page_{count}.jpg`` into ``output_dir``. Prints a message and returns
        when no text is detected. Errors while writing outputs are printed,
        not raised.
    """
    # exist_ok avoids the check-then-create race of a separate os.path.exists test
    os.makedirs(output_dir, exist_ok=True)

    # Optimize image before OCR
    img_pil = Image.fromarray(image)
    w, h = img_pil.size
    # Resize if image is very large
    if w > max_side or h > max_side:
        ratio = min(max_side / w, max_side / h)
        # Clamp to 1px so an extremely elongated image cannot round a
        # dimension down to zero, which resize() would reject
        new_size = (max(1, int(w * ratio)), max(1, int(h * ratio)))
        img_pil = img_pil.resize(new_size, Image.Resampling.LANCZOS)
        image = np.array(img_pil)

    result = ocr.ocr(image, cls=False)  # Disable classifier for speed
    if not result or not result[0]:
        print(f"No text detected in image {count}")
        return

    # Each detected line is [box, (text, score)]
    detections = result[0]
    boxes = [line[0] for line in detections]
    texts = [line[1][0] for line in detections]
    scores = [line[1][1] for line in detections]

    try:
        # Write the recognised text first so it is kept even if drawing or
        # saving the annotated image fails (e.g. no usable font)
        text_output = os.path.join(output_dir, f"page_{count}_text.txt")
        with open(text_output, 'w', encoding='utf-8') as f:
            f.writelines(f"{text} (confidence: {score:.2f})\n" for text, score in zip(texts, scores))

        img_drawn = draw_ocr(img_pil, boxes, texts, scores, font_path=get_font_path())

        # Use JPEG for faster saving, adjust quality as needed
        output_path = os.path.join(output_dir, f"page_{count}.jpg")
        Image.fromarray(img_drawn).save(output_path, 'JPEG', quality=jpeg_quality)

    except Exception as e:
        print(f"Error processing page {count}: {str(e)}")

def pdf_to_images(pdf_path, output_dir="output", zoom=2):
    """Render each page of a PDF and hand it to ``process_image``.

    Args:
        pdf_path: Path of the PDF, passed to ``fitz.open``.
        output_dir: Directory forwarded to ``process_image`` for its outputs.
        zoom: Scale factor used for both axes of the render matrix.
            Defaults to 2, the previous hard-coded value.

    Returns:
        None. Errors are printed rather than raised: a failure on one page is
        reported and the loop moves on; any other failure is reported as an
        error opening the PDF. The document is closed if it was opened.
    """
    # Explicit sentinel so the finally clause knows whether open() succeeded,
    # instead of introspecting locals()
    doc = None
    try:
        doc = fitz.open(pdf_path)
        total_pages = len(doc)
        print(f"Processing PDF with {total_pages} pages...")

        for page_num in range(total_pages):
            try:
                page = doc[page_num]
                # Higher zoom gives a larger render at the cost of speed
                pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom))
                img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
                img = np.array(img)

                # Pages are numbered from 1 in file names and messages
                process_image(img, page_num + 1, output_dir)

            except Exception as e:
                print(f"Error processing page {page_num + 1}: {str(e)}")
                continue

    except Exception as e:
        print(f"Error opening PDF: {str(e)}")
    finally:
        if doc is not None:
            doc.close()

In [9]:
%%time
# Initialize OCR
initialise("en")

# Process your PDF
pdf_to_images("/home/ubuntu/AlphaBuffet/l99139ae10vkxpdfy.pdf")

[2025/02/12 16:56:37] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=False, use_xpu=False, use_npu=False, use_mlu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='/home/ubuntu/.paddleocr/whl/det/en/en_PP-OCRv3_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='/home/ubuntu/.paddleocr/whl/rec/en/en_PP-OCRv4_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 320', rec_batch_num=6, max_text_le