In [3]:
# -*- coding: utf-8 -*-
"""
Folder of PDFs -> one TXT per PDF (Thai/English)
- Recursively finds all *.pdf in input_dir
- Tries digital text first (fast, accurate)
- Falls back to EasyOCR (memory-safe: streams pages, caps size, tiles if needed)
- Writes ONE combined <pdf_stem>_full.txt next to each PDF
"""

from pathlib import Path
import gc
import sys
import traceback

# ====== CONFIG (edit as needed) ==============================================
# All tunables for the PDF -> TXT pipeline defined in this cell.
input_dir_str = r"../../resource"  # <--- change to your folder (searched recursively for *.pdf)
LANGS = ['th', 'en']               # Languages passed to easyocr.Reader
USE_GPU = False                    # Passed to easyocr.Reader(gpu=...); set True if you have CUDA
MAX_DIM = 1600                     # Target longest side when rendering a page (lower if OOM)
CANVAS_SIZE = 1920                 # Passed to readtext(canvas_size=...); try 1536 if RAM is tight
TILE_H = 900                       # Tile height used when a full-page OCR runs out of memory
TILE_OVERLAP = 80                  # Overlap between tiles in the tiled OCR fallback
DIGITAL_TEXT_MINLEN = 200          # If digital text len >= this, skip OCR
ENCODING = "utf-8-sig"             # Output file encoding; helps Thai display in Windows Notepad
SKIP_EXISTING = True               # Skip PDFs if corresponding _full.txt already exists
# ============================================================================

# PyMuPDF is required by the functions below: on ImportError, print an install
# hint to stderr and re-raise so the cell fails instead of continuing without it.
try:
    import fitz  # PyMuPDF
except ImportError:
    print("Please: pip install pymupdf", file=sys.stderr)
    raise

def extract_digital_text(pdf_path: Path) -> str:
    """Read the embedded text of every page without running OCR.

    Each page's text is stripped, the pages are joined with a
    "==== PAGE BREAK ====" separator, and the joined result is stripped again.
    """
    separator = "\n\n==== PAGE BREAK ====\n\n"
    with fitz.open(str(pdf_path)) as document:
        # A page whose get_text() result is falsy contributes an empty string.
        page_texts = [(page.get_text("text") or "").strip() for page in document]
    return separator.join(page_texts).strip()

def iter_pdf_images(pdf_path: Path, max_dim: int = 1600, min_zoom: float = 1.0, max_zoom: float = 2.0):
    """Render the PDF one page at a time and yield each page as a uint8 numpy array.

    The zoom applied to a page is ``max_dim / longest_side`` clamped to
    ``[min_zoom, max_zoom]``. Because of the lower clamp, a page whose longest
    side is already larger than ``max_dim`` is rendered at ``min_zoom`` and is
    not shrunk below that.

    Yields:
        Array of shape (pixmap height, pixmap width, pixmap.n) built on the
        pixmap's sample buffer. Only one page image is produced at a time.
    """
    import numpy as np

    document = fitz.open(str(pdf_path))
    try:
        for page in document:
            longest_side = max(page.rect.width, page.rect.height)
            scale = max(min_zoom, min(max_zoom, max_dim / longest_side))
            pixmap = page.get_pixmap(matrix=fitz.Matrix(scale, scale), alpha=False)
            frame = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(
                pixmap.height, pixmap.width, pixmap.n
            )
            yield frame
            # Drop our reference before rendering the next page.
            del frame
            gc.collect()
    finally:
        # Runs on exhaustion, on error, and when the consumer stops early.
        document.close()

def ocr_with_tiling(reader, img, tile_h: int = 1100, overlap: int = 80, **ocr_kwargs) -> str:
    """OCR an image in vertical tiles (bands of rows) to reduce peak memory.

    Tiles are at most ``tile_h`` rows tall and consecutive tiles share exactly
    ``overlap`` rows. Text inside an overlap region may therefore be recognized
    in both neighbouring tiles; no de-duplication is attempted.

    Args:
        reader: Object exposing ``readtext(image, detail=0, paragraph=True, **kw)``
            and returning a list of strings.
        img: Image indexed as ``img[rows, :, :]`` whose ``shape[0]`` is the height.
        tile_h: Maximum tile height in rows; must be positive.
        overlap: Rows shared by consecutive tiles; must satisfy 0 <= overlap < tile_h.
        **ocr_kwargs: Forwarded unchanged to ``reader.readtext``.

    Returns:
        The per-tile texts joined with newlines ("" for a zero-height image).

    Raises:
        ValueError: If ``tile_h``/``overlap`` would make the scan unable to
            advance (the loop would otherwise never terminate).
    """
    if tile_h <= 0:
        raise ValueError(f"tile_h must be positive, got {tile_h}")
    if not 0 <= overlap < tile_h:
        raise ValueError(f"overlap must satisfy 0 <= overlap < tile_h, got {overlap}")

    height = img.shape[0]
    stride = tile_h - overlap  # > 0 thanks to the checks above
    texts = []
    top = 0
    while top < height:
        bottom = min(height, top + tile_h)
        tile = img[top:bottom, :, :]
        lines = reader.readtext(tile, detail=0, paragraph=True, **ocr_kwargs)
        texts.append("\n".join(lines))
        del tile
        gc.collect()
        if bottom >= height:
            # The image bottom is covered; another tile would only re-read
            # rows that were already OCR'd and duplicate their text.
            break
        top += stride
    return "\n".join(texts)

def ocr_pdf_to_text(pdf_path: Path) -> str:
    """Return the full text of one PDF: digital text first, EasyOCR as fallback.

    The digital text is accepted when the pages contain at least
    DIGITAL_TEXT_MINLEN characters in total. Page-break separators are not
    counted: a PDF with many empty pages would otherwise reach the threshold
    on separators alone and never be OCR'd.

    In the OCR path each rendered page is read in one pass; if that raises an
    out-of-memory RuntimeError the page is re-read in tiles instead.

    Returns:
        Page texts joined with "==== PAGE BREAK ====" separators.

    Raises:
        ImportError: If OCR is needed and easyocr is not installed.
        RuntimeError: Any non-out-of-memory RuntimeError from the OCR engine.
    """
    page_marker = "==== PAGE BREAK ===="

    # 1) Try digital text layer
    digital = extract_digital_text(pdf_path)
    # Measure only the page contents, not the separators between them.
    digital_len = sum(len(chunk.strip()) for chunk in digital.split(page_marker))
    if digital_len >= DIGITAL_TEXT_MINLEN:
        print(f"[INFO] {pdf_path.name}: digital text detected -> no OCR")
        return digital

    print(f"[INFO] {pdf_path.name}: little/no digital text -> OCR")
    # 2) OCR fallback
    try:
        import easyocr
    except ImportError:
        print("Please: pip install easyocr", file=sys.stderr)
        raise

    reader = easyocr.Reader(LANGS, gpu=USE_GPU)
    OCR_KW = dict(canvas_size=CANVAS_SIZE, mag_ratio=1.0, batch_size=1, workers=0)

    page_texts = []
    for img in iter_pdf_images(pdf_path, max_dim=MAX_DIM):
        try:
            lines = reader.readtext(img, detail=0, paragraph=True, **OCR_KW)
            page_texts.append("\n".join(lines))
        except RuntimeError as e:
            # Match both the CPU allocator wording ("not enough memory") and the
            # CUDA wording ("out of memory") so tiling also kicks in with USE_GPU.
            message = str(e).lower()
            if "not enough memory" in message or "out of memory" in message:
                print(f"[WARN] {pdf_path.name}: OOM on full page -> tiling")
                txt = ocr_with_tiling(reader, img, tile_h=TILE_H, overlap=TILE_OVERLAP, **OCR_KW)
                page_texts.append(txt)
            else:
                raise
        del img
        gc.collect()

    return "\n\n==== PAGE BREAK ====\n\n".join(page_texts).strip()

def process_pdf(pdf_path: Path) -> Path:
    """Convert a single PDF to text and save it as <stem>_full.txt beside the PDF.

    When SKIP_EXISTING is set and the target file is already there, nothing is
    recomputed. The target path is returned in either case.
    """
    target = pdf_path.with_name(pdf_path.stem + "_full.txt")

    already_done = SKIP_EXISTING and target.exists()
    if already_done:
        print(f"[SKIP] Already exists: {target.name}")
        return target

    text = ocr_pdf_to_text(pdf_path) or ""
    with target.open("w", encoding=ENCODING, newline="\n") as handle:
        handle.write(text)
    print(f"[DONE] {pdf_path.name} -> {target.name}")
    return target

def main():
    """Run the PDF -> TXT conversion for every *.pdf found under input_dir_str.

    A failure on one PDF is reported with its traceback and counted; the
    remaining PDFs are still processed. A summary line is printed at the end.

    Raises:
        FileNotFoundError: If the input folder does not exist.
    """
    root = Path(input_dir_str)
    if not root.exists():
        raise FileNotFoundError(f"Input folder not found: {root.resolve()}")

    pdf_files = sorted(root.rglob("*.pdf"))
    total = len(pdf_files)
    if total == 0:
        print(f"[INFO] No PDFs found under: {root.resolve()}")
        return

    print(f"[INFO] Found {total} PDF(s) under {root.resolve()}\n")
    succeeded = 0
    errored = 0
    for position, pdf_path in enumerate(pdf_files, start=1):
        try:
            print(f"[{position}/{total}] Processing: {pdf_path}")
            process_pdf(pdf_path)
        except Exception:
            errored += 1
            print(f"[ERROR] Failed: {pdf_path.name}")
            traceback.print_exc()
        else:
            succeeded += 1
        finally:
            # Reclaim memory between documents regardless of the outcome.
            gc.collect()

    print(f"\n[SUMMARY] Success: {succeeded}  Failed: {errored}")


In [4]:
# Run the PDF -> TXT pipeline over every PDF under input_dir_str.
main()

[INFO] Found 10 PDF(s) under D:\Users\Dell\Documents\GeoSpark\rag-app\backend\resource

[1/10] Processing: ..\..\resource\107b991c3b5e1bdc.pdf
[SKIP] Already exists: 107b991c3b5e1bdc_full.txt
[2/10] Processing: ..\..\resource\การปรับตัวรองรับการเปลี่ยนแปลงสภาพภูมิอากาศของเกษตรกรชาวสวนยางพาราจังหวัดระยอง.pdf
[SKIP] Already exists: การปรับตัวรองรับการเปลี่ยนแปลงสภาพภูมิอากาศของเกษตรกรชาวสวนยางพาราจังหวัดระยอง_full.txt
[3/10] Processing: ..\..\resource\ปัจจัยที่มีอิทธิพลต่อการปรับตัวของเกษตรกรชาวสวนยางพาราต่อการเปลี่ยนแปลงสภาพภูมิอากาสในพื้นที่ภาคใต้ตอนล่างของประเทศไทย.pdf
[SKIP] Already exists: ปัจจัยที่มีอิทธิพลต่อการปรับตัวของเกษตรกรชาวสวนยางพาราต่อการเปลี่ยนแปลงสภาพภูมิอากาสในพื้นที่ภาคใต้ตอนล่างของประเทศไทย_full.txt
[4/10] Processing: ..\..\resource\ผลกระทบของการเปลี่ยนแปลงสภาพภูมิอากาศต่อผลผลิตยางพาราในพื้นที่เขตภาคใต้ตอนล่างของประเทศไทย.pdf
[SKIP] Already exists: ผลกระทบของการเปลี่ยนแปลงสภาพภูมิอากาศต่อผลผลิตยางพาราในพื้นที่เขตภาคใต้ตอนล่างของประเทศไทย_full.txt
[5/10] Processing: .

In [5]:
# -*- coding: utf-8 -*-
"""
PDFs -> Markdown (.md) with table handling (Thai/English)
- Recursively finds all *.pdf in input_dir
- DIGITAL-FIRST to Markdown via PyMuPDF4LLM (handles headings/lists/tables well)
- If PyMuPDF4LLM unavailable or output is too short, fallback:
  - pdfplumber text + table extraction -> reconstruct Markdown tables
- Final fallback (scanned PDFs): EasyOCR (streamed, tiled). (Tables degrade to plain text)
- Writes ONE <pdf_stem>.md next to each PDF

Install (pick what you need):
    pip install pymupdf pymupdf4llm pdfplumber easyocr pypdf pillow numpy

Note: To respect “no long sentences in tables”, a cell longer than
TRUNCATE_TABLE_CELL_CHARS is replaced by a numbered marker (e.g. “…[1]”)
and its full text is listed under the table as “Notes”.
"""

from pathlib import Path
import gc
import sys
import traceback
import re

# ===================== CONFIG =====================
# All tunables for the PDF -> Markdown pipeline defined in this cell.
input_dir_str = r"../../resource"   # <--- change to your folder (searched recursively for *.pdf)
ENCODING = "utf-8-sig"              # Output file encoding; Windows-friendly for Thai
SKIP_EXISTING = True                # Skip PDFs if *.md already exists

# Thresholds / behavior
DIGITAL_MD_MINLEN = 300             # if digital markdown len >= this, accept it without further fallbacks
TRUNCATE_TABLE_CELL_CHARS = 120     # cells longer than this become a note marker in make_md_table
REPLACE_VERTICAL_BAR = "¦"          # replace '|' in cells to avoid MD break

# OCR fallback (for scanned PDFs)
LANGS = ['th', 'en']                # Languages passed to easyocr.Reader
USE_GPU = False                     # Passed to easyocr.Reader(gpu=...); True if you have CUDA
MAX_DIM = 1600                      # Target longest side when rasterizing pages
CANVAS_SIZE = 1920                  # Passed to readtext(canvas_size=...)
TILE_H = 900                        # Tile height for the tiled OCR fallback
TILE_OVERLAP = 80                   # Overlap between tiles
# ==================================================


# ----------------- Utilities -----------------
def normalize_whitespace(s: str) -> str:
    """Drop carriage returns, collapse runs of spaces/tabs to one space, and strip.

    Newlines inside the text are kept; a falsy input (None, "") yields "".
    """
    text = s if s else ""
    without_cr = text.replace("\r", "")
    collapsed = re.sub(r"[ \t]+", " ", without_cr)
    return collapsed.strip()

def safe_cell(s):
    """Turn a raw cell value into text that is safe inside a Markdown table row.

    None becomes "". Otherwise the value is converted with str(), every '|' is
    swapped for REPLACE_VERTICAL_BAR, every newline becomes "<br>", and the
    result is stripped.
    """
    if s is None:
        return ""
    text = str(s)
    # Order matters: pipes first, then newlines.
    for raw, substitute in (("|", REPLACE_VERTICAL_BAR), ("\n", "<br>")):
        text = text.replace(raw, substitute)
    return text.strip()

def make_md_table(rows):
    """
    Build a Markdown table from a list of rows (each row a list of cell values).

    Cells are sanitized with safe_cell() and short rows are padded with empty
    cells. The first row is used as the header unless fewer than
    max(1, columns // 2) of its cells are non-empty and non-numeric, in which
    case "Col1..ColN" headers are synthesized and every row becomes body.

    A cell longer than TRUNCATE_TABLE_CELL_CHARS is replaced by a marker
    "…[k]" and its full text is listed under the table in a "_Notes_:" list,
    labelled with its row/column position in the rendered table.

    Returns:
        (markdown, note_count). ("", 0) when there are no rows or when no row
        contains any cell, so callers never receive a column-less table.
    """
    if not rows:
        return "", 0

    # Normalize rows to strings
    rows = [[safe_cell(c) for c in (r or [])] for r in rows]
    # Make all rows same length
    max_cols = max(len(r) for r in rows)
    if max_cols == 0:
        # Only empty/None rows: a header like "|  |" is not a valid table.
        return "", 0
    rows = [r + [""] * (max_cols - len(r)) for r in rows]

    # Use first row as header; if it's mostly numeric/blank, synthesize headers
    header = rows[0]
    if sum(1 for c in header if c and not c.replace(".", "", 1).isdigit()) < max(1, max_cols // 2):
        header = [f"Col{i+1}" for i in range(max_cols)]
        body = rows
    else:
        body = rows[1:]

    # Truncate long cells and collect notes
    notes = []

    def maybe_trunc(c, r_idx, c_idx):
        if len(c) > TRUNCATE_TABLE_CELL_CHARS:
            note_idx = len(notes) + 1
            notes.append((note_idx, r_idx, c_idx, c))
            return f"…[{note_idx}]"
        return c

    # Row index 0 is the header; body rows are numbered from 1.
    header = [maybe_trunc(c, 0, j) for j, c in enumerate(header)]
    body = [
        [maybe_trunc(c, i, j) for j, c in enumerate(r)]
        for i, r in enumerate(body, start=1)
    ]

    # Build MD
    md = []
    md.append("| " + " | ".join(header) + " |")
    md.append("| " + " | ".join(["---"] * max_cols) + " |")
    for r in body:
        md.append("| " + " | ".join(r) + " |")

    if notes:
        md.append("")
        md.append("_Notes_:")
        for idx, r_i, c_i, txt in notes:
            md.append(f"- [{idx}] R{r_i+1}C{c_i+1}: {txt}")

    return "\n".join(md), len(notes)


# ----------------- DIGITAL FIRST: PyMuPDF4LLM -----------------
def digital_md_pymupdf4llm(pdf_path: Path) -> str:
    """Convert a PDF to Markdown with pymupdf4llm, best-effort.

    Returns the stripped Markdown, or "" when pymupdf4llm cannot be imported
    or the conversion fails, so the caller can fall through to the next
    strategy. Conversion failures are reported on stderr instead of being
    dropped silently.
    """
    try:
        import pymupdf4llm  # type: ignore
    except Exception:
        # Optional dependency: absence simply disables this strategy.
        return ""

    # Fast, layout-aware Markdown; no images in MD.
    base_kwargs = dict(page_chunks=False, write_images=False)
    try:
        try:
            # We handle OCR fallback separately, so ask for no OCR.
            md = pymupdf4llm.to_markdown(str(pdf_path), ocr=False, **base_kwargs)
        except TypeError:
            # NOTE(review): `ocr` does not appear to be accepted by every
            # pymupdf4llm release; an unexpected-keyword TypeError would
            # otherwise disable this strategy for every PDF. Retry without it.
            md = pymupdf4llm.to_markdown(str(pdf_path), **base_kwargs)
        return (md or "").strip()
    except Exception as exc:
        print(
            f"[WARN] {pdf_path.name}: pymupdf4llm failed ({type(exc).__name__}: {exc})",
            file=sys.stderr,
        )
        return ""


# ----------------- DIGITAL FALLBACK: pdfplumber -----------------
def digital_md_pdfplumber(pdf_path: Path) -> str:
    """
    Convert a PDF to Markdown with pdfplumber, page by page, best-effort:
      - extract tables (first strategy that yields a non-empty table wins)
      - extract page text
      - compose Markdown with tables + text

    Each page contributes an optional "## Page N — Tables" section followed by
    an optional "## Page N — Text" section; pages are separated by "----".
    Table content is not removed from the page text, so it can appear twice.

    Returns "" when pdfplumber cannot be imported or when anything fails while
    reading the document (the failure is reported on stderr), so the caller
    can fall through to the next strategy.
    """
    try:
        import pdfplumber  # type: ignore
    except Exception:
        # Optional dependency: absence simply disables this strategy.
        return ""

    # Table-detection strategies, tried in this order on every page.
    table_settings_list = [
        {"vertical_strategy": "lines", "horizontal_strategy": "lines"},
        {"vertical_strategy": "text", "horizontal_strategy": "text"},
        {"vertical_strategy": "lines", "horizontal_strategy": "text"},
        {"vertical_strategy": "text", "horizontal_strategy": "lines"},
    ]

    parts = []
    try:
        with pdfplumber.open(str(pdf_path)) as pdf:
            for pi, page in enumerate(pdf.pages, start=1):
                page_lines = []

                used_tables = []
                for ts in table_settings_list:
                    try:
                        tables = page.extract_tables(table_settings=ts) or []
                        # Drop tables that have no non-empty cell at all
                        tables = [t for t in tables if t and any(any(c for c in row) for row in t)]
                    except Exception:
                        # This strategy failed on this page; try the next one.
                        continue
                    if tables:
                        used_tables = tables
                        break

                table_blocks = []
                for ti, tbl in enumerate(used_tables, start=1):
                    # Normalize rows and drop totally empty rows
                    norm_rows = []
                    for row in tbl:
                        row = [normalize_whitespace(c) for c in (row or [])]
                        if any(row):
                            norm_rows.append(row)
                    if not norm_rows:
                        continue

                    md_table, _notes = make_md_table(norm_rows)
                    table_blocks.extend([f"**Table {pi}.{ti}**", md_table, ""])

                # Emit the heading only when at least one table survived
                # normalization; otherwise it would be an empty section.
                if table_blocks:
                    page_lines.append(f"## Page {pi} — Tables")
                    page_lines.extend(table_blocks)

                # Page text (avoid dupe by not trying to subtract table areas)
                txt = (page.extract_text() or "").strip()
                if txt:
                    page_lines.append(f"## Page {pi} — Text")
                    page_lines.append(txt)

                if page_lines:
                    parts.append("\n\n".join(page_lines))
    except Exception as exc:
        # Discard partial output: an incomplete document must not be mistaken
        # for a successful conversion by the caller's length check.
        print(
            f"[WARN] {pdf_path.name}: pdfplumber failed ({type(exc).__name__}: {exc})",
            file=sys.stderr,
        )
        return ""

    return ("\n\n----\n\n".join(parts)).strip()


# ----------------- SCANNED FALLBACK: EasyOCR -----------------
def iter_pdf_images(pdf_path: Path, max_dim: int = 1600, min_zoom: float = 1.0, max_zoom: float = 2.0):
    """Render the PDF one page at a time and yield each page as a uint8 numpy array.

    The zoom applied to a page is ``max_dim / longest_side`` clamped to
    ``[min_zoom, max_zoom]``. Because of the lower clamp, a page whose longest
    side is already larger than ``max_dim`` is rendered at ``min_zoom`` and is
    not shrunk below that.

    Yields:
        Array of shape (pixmap height, pixmap width, pixmap.n) built on the
        pixmap's sample buffer. Only one page image is produced at a time.
    """
    import numpy as np
    import fitz  # PyMuPDF

    document = fitz.open(str(pdf_path))
    try:
        for page in document:
            longest_side = max(page.rect.width, page.rect.height)
            scale = max(min_zoom, min(max_zoom, max_dim / longest_side))
            pixmap = page.get_pixmap(matrix=fitz.Matrix(scale, scale), alpha=False)
            frame = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(
                pixmap.height, pixmap.width, pixmap.n
            )
            yield frame
            # Drop our reference before rendering the next page.
            del frame
            gc.collect()
    finally:
        # Runs on exhaustion, on error, and when the consumer stops early.
        document.close()

def ocr_with_tiling(reader, img, tile_h: int, overlap: int, **ocr_kwargs) -> str:
    """OCR an image in vertical tiles (bands of rows) to reduce peak memory.

    Tiles are at most ``tile_h`` rows tall and consecutive tiles share exactly
    ``overlap`` rows. Text inside an overlap region may therefore be recognized
    in both neighbouring tiles; no de-duplication is attempted.

    Args:
        reader: Object exposing ``readtext(image, detail=0, paragraph=True, **kw)``
            and returning a list of strings.
        img: Image indexed as ``img[rows, :, :]`` whose ``shape[0]`` is the height.
        tile_h: Maximum tile height in rows; must be positive.
        overlap: Rows shared by consecutive tiles; must satisfy 0 <= overlap < tile_h.
        **ocr_kwargs: Forwarded unchanged to ``reader.readtext``.

    Returns:
        The per-tile texts joined with newlines ("" for a zero-height image).

    Raises:
        ValueError: If ``tile_h``/``overlap`` would make the scan unable to
            advance (the loop would otherwise never terminate).
    """
    if tile_h <= 0:
        raise ValueError(f"tile_h must be positive, got {tile_h}")
    if not 0 <= overlap < tile_h:
        raise ValueError(f"overlap must satisfy 0 <= overlap < tile_h, got {overlap}")

    height = img.shape[0]
    stride = tile_h - overlap  # > 0 thanks to the checks above
    texts = []
    top = 0
    while top < height:
        bottom = min(height, top + tile_h)
        tile = img[top:bottom, :, :]
        lines = reader.readtext(tile, detail=0, paragraph=True, **ocr_kwargs)
        texts.append("\n".join(lines))
        del tile
        gc.collect()
        if bottom >= height:
            # The image bottom is covered; another tile would only re-read
            # rows that were already OCR'd and duplicate their text.
            break
        top += stride
    return "\n".join(texts)

def ocr_pdf_to_md(pdf_path: Path) -> str:
    """OCR every page of a PDF with EasyOCR and return Markdown, best-effort.

    Each page with recognized text becomes a "## Page N" section; sections are
    separated by "----". A page is first read in one pass; if that raises an
    out-of-memory RuntimeError it is re-read in tiles.

    Returns:
        The Markdown text, or "" when easyocr is not installed (reported on
        stderr so an empty result is not mistaken for an empty document).

    Raises:
        RuntimeError: Any non-out-of-memory RuntimeError from the OCR engine.
    """
    try:
        import easyocr  # type: ignore
    except ImportError:
        print(
            f"[WARN] {pdf_path.name}: easyocr is not installed -> OCR fallback skipped "
            "(pip install easyocr)",
            file=sys.stderr,
        )
        return ""

    reader = easyocr.Reader(LANGS, gpu=USE_GPU)
    OCR_KW = dict(canvas_size=CANVAS_SIZE, mag_ratio=1.0, batch_size=1, workers=0)

    page_md = []
    for i, img in enumerate(iter_pdf_images(pdf_path, max_dim=MAX_DIM), start=1):
        try:
            lines = reader.readtext(img, detail=0, paragraph=True, **OCR_KW)
            txt = "\n".join(lines)
        except RuntimeError as e:
            # Match both the CPU allocator wording ("not enough memory") and the
            # CUDA wording ("out of memory") so tiling also kicks in with USE_GPU.
            message = str(e).lower()
            if "not enough memory" in message or "out of memory" in message:
                print(f"[WARN] {pdf_path.name}: OOM on page {i} -> tiling", file=sys.stderr)
                txt = ocr_with_tiling(reader, img, tile_h=TILE_H, overlap=TILE_OVERLAP, **OCR_KW)
            else:
                raise
        finally:
            # Runs after the tiled retry too, so the page image is always released.
            del img
            gc.collect()

        if txt.strip():
            page_md.append(f"## Page {i}\n\n{txt.strip()}")

    return ("\n\n----\n\n".join(page_md)).strip()


# ----------------- Orchestration -----------------
def pick_md(pdf_path: Path) -> str:
    """Choose the Markdown for one PDF by trying three strategies in order.

    1. pymupdf4llm output, if it is at least DIGITAL_MD_MINLEN characters.
    2. pdfplumber output, if it is that long, or if it is non-empty while
       step 1 produced nothing.
    3. OCR output; when that is empty, whichever earlier result is non-empty
       (step 1 preferred over step 2).
    """
    primary = digital_md_pymupdf4llm(pdf_path)
    if len(primary) >= DIGITAL_MD_MINLEN:
        return primary

    secondary = digital_md_pdfplumber(pdf_path)
    long_enough = len(secondary) >= DIGITAL_MD_MINLEN
    only_digital_result = bool(secondary) and primary == ""
    if long_enough or only_digital_result:
        return secondary

    scanned = ocr_pdf_to_md(pdf_path)
    for candidate in (scanned, primary):
        if candidate:
            return candidate
    # Last resort, possibly empty.
    return secondary


def process_pdf(pdf_path: Path) -> Path:
    """Convert a single PDF to Markdown and save it as <stem>.md beside the PDF.

    When SKIP_EXISTING is set and the target file is already there, nothing is
    recomputed. The target path is returned in either case.
    """
    target = pdf_path.with_suffix(".md")

    already_done = SKIP_EXISTING and target.exists()
    if already_done:
        print(f"[SKIP] Already exists: {target.name}")
        return target

    markdown = pick_md(pdf_path) or ""
    with target.open("w", encoding=ENCODING, newline="\n") as handle:
        handle.write(markdown)
    print(f"[DONE] {pdf_path.name} -> {target.name}")
    return target


def main():
    """Run the PDF -> Markdown conversion for every *.pdf found under input_dir_str.

    A failure on one PDF is reported with its traceback and counted; the
    remaining PDFs are still processed. A summary line is printed at the end.

    Raises:
        FileNotFoundError: If the input folder does not exist.
    """
    root = Path(input_dir_str)
    if not root.exists():
        raise FileNotFoundError(f"Input folder not found: {root.resolve()}")

    pdf_files = sorted(root.rglob("*.pdf"))
    total = len(pdf_files)
    if total == 0:
        print(f"[INFO] No PDFs found under: {root.resolve()}")
        return

    print(f"[INFO] Found {total} PDF(s) under {root.resolve()}\n")
    succeeded = 0
    errored = 0
    for position, pdf_path in enumerate(pdf_files, start=1):
        try:
            print(f"[{position}/{total}] {pdf_path}")
            process_pdf(pdf_path)
        except Exception:
            errored += 1
            print(f"[ERROR] Failed: {pdf_path.name}")
            traceback.print_exc()
        else:
            succeeded += 1
        finally:
            # Reclaim memory between documents regardless of the outcome.
            gc.collect()

    print(f"\n[SUMMARY] Success: {succeeded}  Failed: {errored}")

In [6]:
# Run the PDF -> Markdown pipeline over every PDF under input_dir_str.
main()

[INFO] Found 10 PDF(s) under D:\Users\Dell\Documents\GeoSpark\rag-app\backend\resource

[1/10] ..\..\resource\107b991c3b5e1bdc.pdf
[SKIP] Already exists: 107b991c3b5e1bdc.md
[2/10] ..\..\resource\การปรับตัวรองรับการเปลี่ยนแปลงสภาพภูมิอากาศของเกษตรกรชาวสวนยางพาราจังหวัดระยอง.pdf
[SKIP] Already exists: การปรับตัวรองรับการเปลี่ยนแปลงสภาพภูมิอากาศของเกษตรกรชาวสวนยางพาราจังหวัดระยอง.md
[3/10] ..\..\resource\ปัจจัยที่มีอิทธิพลต่อการปรับตัวของเกษตรกรชาวสวนยางพาราต่อการเปลี่ยนแปลงสภาพภูมิอากาสในพื้นที่ภาคใต้ตอนล่างของประเทศไทย.pdf
[SKIP] Already exists: ปัจจัยที่มีอิทธิพลต่อการปรับตัวของเกษตรกรชาวสวนยางพาราต่อการเปลี่ยนแปลงสภาพภูมิอากาสในพื้นที่ภาคใต้ตอนล่างของประเทศไทย.md
[4/10] ..\..\resource\ผลกระทบของการเปลี่ยนแปลงสภาพภูมิอากาศต่อผลผลิตยางพาราในพื้นที่เขตภาคใต้ตอนล่างของประเทศไทย.pdf
[SKIP] Already exists: ผลกระทบของการเปลี่ยนแปลงสภาพภูมิอากาศต่อผลผลิตยางพาราในพื้นที่เขตภาคใต้ตอนล่างของประเทศไทย.md
[5/10] ..\..\resource\มาตรการชะลอการเก็บเกี่ยวมันสำปะหลังปี2566_67.pdf
[SKIP] Already exists

Using CPU. Note: This module is much faster with a GPU.
Using CPU. Note: This module is much faster with a GPU.


[DONE] รายงานการเปลี่ยนแปลงสภาพภูมิอากาศ.pdf -> รายงานการเปลี่ยนแปลงสภาพภูมิอากาศ.md
[8/10] ..\..\resource\สถานการณ์ยางพฤษภาคม2568.pdf


Using CPU. Note: This module is much faster with a GPU.


[DONE] สถานการณ์ยางพฤษภาคม2568.pdf -> สถานการณ์ยางพฤษภาคม2568.md
[9/10] ..\..\resource\แผนปฏิบัติการประจําปีงบประมาณปี2567ของการยางแห่งประเทศไทย.pdf


: 