# 📄 LayoutLMv3 + PaddleOCR Contract Extraction Demo

Trích xuất thông tin từ hợp đồng PDF sử dụng PaddleOCR và PyMuPDF (fitz).

**Không cần poppler-utils, pytesseract nữa.**

---

In [None]:
# ✅ Install the required libraries
!pip install paddleocr pymupdf --upgrade --quiet
!pip install opencv-python --quiet

In [None]:
import fitz  # PyMuPDF
from paddleocr import PaddleOCR
from collections import defaultdict
import os
from PIL import Image

In [None]:
# ✅ Convert each PDF page into a PNG image
def pdf_to_images(pdf_path, output_dir="images", dpi=300):
    """Render every page of a PDF to a PNG file and return the file paths.

    Args:
        pdf_path: Path of the PDF document, opened with ``fitz.open``.
        output_dir: Directory that receives the PNG files; created if missing.
        dpi: Resolution forwarded to ``page.get_pixmap``.

    Returns:
        List of the written image paths in page order
        (``page_0.png``, ``page_1.png``, ...).
    """
    os.makedirs(output_dir, exist_ok=True)
    image_paths = []
    # The context manager closes the document even when rendering or saving
    # a page raises, so the file handle is never leaked.
    with fitz.open(pdf_path) as doc:
        for i, page in enumerate(doc):
            pix = page.get_pixmap(dpi=dpi)
            img_path = os.path.join(output_dir, f"page_{i}.png")
            pix.save(img_path)
            image_paths.append(img_path)
    return image_paths

# Render the contract PDF to per-page PNG files; the path looks like a
# placeholder — presumably to be replaced with a real input file.
images = pdf_to_images("../store/input/your_contract_file.pdf")
# Open the first rendered page; as the last expression of the cell this is
# presumably meant to preview the image inline. Raises IndexError if the
# PDF produced no pages.
Image.open(images[0])

In [None]:
# ✅ OCR + group text lines by their Y coordinate
# Module-level OCR engine shared by ocr_image(); built once with
# use_angle_cls=True and lang='vi' (presumably Vietnamese).
ocr = PaddleOCR(use_angle_cls=True, lang='vi')

def ocr_image(image_path):
    """Run OCR on one image and return ``(y_center, text)`` per detected line.

    Args:
        image_path: Image path handed to the module-level ``ocr`` engine.

    Returns:
        List of ``(y_center, text)`` tuples, one per entry of the first
        result page; an empty list when nothing was detected.
    """
    result = ocr.ocr(image_path)
    # In PaddleOCR 2.x a page without detected text can come back as None
    # (result == [None]); iterating it would raise TypeError.
    if not result or not result[0]:
        return []

    lines = []
    for line in result[0]:
        box = line[0]
        text = line[1][0]
        # Vertical midpoint between the first and third box corners
        # (presumably top-left and bottom-right), truncated to an int.
        y_center = int((box[0][1] + box[2][1]) / 2)
        lines.append((y_center, text))
    return lines

def group_lines_by_row(lines, y_tolerance=10):
    """Bucket ``(y, text)`` pairs into rows of vertically close entries.

    Each pair joins the first existing row (in creation order) whose anchor
    y is within ``y_tolerance`` of its own y; otherwise it starts a new row
    anchored at its y.

    Args:
        lines: Iterable of ``(y, text)`` pairs.
        y_tolerance: Maximum absolute y distance to an existing row anchor.

    Returns:
        List of rows (each a list of texts in input order), sorted by the
        rows' anchor y values.
    """
    buckets = {}
    for y_pos, content in lines:
        # First anchor within tolerance wins; fall back to the entry's own y.
        anchor = next(
            (key for key in buckets if abs(key - y_pos) <= y_tolerance),
            y_pos,
        )
        buckets.setdefault(anchor, []).append(content)
    return [buckets[key] for key in sorted(buckets)]

# Run OCR on the first page image, group the recognized text into rows,
# and print each row (a list of text fragments) on its own line.
lines = ocr_image(images[0])
rows = group_lines_by_row(lines)
for r in rows:
    print(r)