In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2


In [2]:
from IPython.display import clear_output, display

In [3]:
import os
import torch
import numpy as np
import cv2
from PIL import Image
import matplotlib.pyplot as plt
from fpdf import FPDF
import os
import json
import re

from utils.pdf_process import load_all_pdf_bytes_from_directory, structure_pdf_images

In [None]:
from models.paddle_ocr import Paddle

In [5]:
# Instantiate the Paddle class imported from models.paddle_ocr with its default
# arguments; the OCR cell below calls paddle.perform_ocr on each page image.
paddle = Paddle()

In [6]:
# Load every PDF found in the "demo" directory. The result is used as a
# mapping whose keys are the PDF names.
pdfs = load_all_pdf_bytes_from_directory("demo")

# Report how many PDFs were found, then list them one per line.
print(f"{len(pdfs)} amount PDF found:")
for name in pdfs:
    print("📄", name)

3 amount PDF found:
📄 20250221092842541.pdf
📄 20250221125114588.pdf
📄 Invoice_2.pdf


In [7]:
# Build the per-PDF structure from the loaded PDFs. Each entry is read below
# through its "filename" and "num_pages" keys.
pdf_structured_data = structure_pdf_images(pdfs)

# Only the metadata values are needed for the summary, so iterate values directly.
for meta in pdf_structured_data.values():
    print(f"📄 {meta['filename']} → {meta['num_pages']} page")

📄 20250221092842541.pdf → 2 page
📄 20250221125114588.pdf → 8 page
📄 Invoice_2.pdf → 5 page


In [8]:
# Run OCR on every page of every PDF and store the per-page results on that
# PDF's metadata dict under the "ocr_results" key.
for pdf_name, meta in pdf_structured_data.items():
    print(f"📄 {pdf_name} Begin OCR process...")

    ocr_pages = []
    # Pages are numbered from 1 in the stored records.
    for page_number, page_image in enumerate(meta["pages"], start=1):
        # perform_ocr returns a pair; only the second element is used here.
        _, ocr_result = paddle.perform_ocr(
            page_image,
            visualize=False,
            class_format=True,
        )
        page_record = dict(
            page=page_number,
            texts=ocr_result.texts,
            boxes=ocr_result.boxes,
            scores=ocr_result.scores,
        )
        ocr_pages.append(page_record)

    meta["ocr_results"] = ocr_pages

📄 20250221092842541.pdf Begin OCR process...
📄 20250221125114588.pdf Begin OCR process...
📄 Invoice_2.pdf Begin OCR process...


In [9]:
from models.claude import ClaudeParser

In [None]:
def load_api_key(file_path="api_key.txt"):
    """Read an API key from a text file.

    Args:
        file_path: Path to a UTF-8 text file holding the key.
            Defaults to "api_key.txt".

    Returns:
        The file contents with leading and trailing whitespace removed.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        ValueError: If the file is empty or contains only whitespace.
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"API key file '{file_path}' not found.")
    with open(file_path, "r", encoding="utf-8") as f:
        key = f.read().strip()
    # Fail here rather than handing an empty key to the API client, where it
    # would only surface later as a harder-to-diagnose authentication error.
    if not key:
        raise ValueError(f"API key file '{file_path}' is empty.")
    return key

# Read the key from the default path ("api_key.txt"); raises FileNotFoundError
# if that file does not exist.
api_key = load_api_key()

In [None]:
# Build the ClaudeParser (from models.claude) with the key loaded above; the
# next cell sends OCR text through claude.ask.
claude = ClaudeParser(api_key=api_key)

In [11]:
# Send each PDF's OCR text to Claude and write the reply to a JSON file.
output_dir = "output"
# open(..., "w") does not create missing parent directories, so make sure the
# output directory exists before the first write (no-op if it already does).
os.makedirs(output_dir, exist_ok=True)

for pdf_name, meta in pdf_structured_data.items():
    # Flatten the per-page text lists into one list, preserving page order.
    all_texts = [text for page in meta["ocr_results"] for text in page["texts"]]

    result_json = claude.ask(all_texts)

    # The output file name keeps the full PDF name (including its extension).
    output_path = os.path.join(output_dir, f"{pdf_name}_claude_output.json")
    with open(output_path, "w", encoding="utf-8") as f:
        # ensure_ascii=False keeps non-ASCII characters readable in the file.
        json.dump(result_json, f, indent=2, ensure_ascii=False)

    print(f"{pdf_name} → Saved Claude JSON to {output_path}")

20250221092842541.pdf → Saved Claude JSON to output\20250221092842541.pdf_claude_output.json
20250221125114588.pdf → Saved Claude JSON to output\20250221125114588.pdf_claude_output.json
Invoice_2.pdf → Saved Claude JSON to output\Invoice_2.pdf_claude_output.json
