In [None]:
# 1. Import libraries
# 2. Run OCR on each image and write every extracted word to a CSV
# 3. Run word-level OCR and record the coordinates of the specified words
# 4. Draw white boxes over those coordinates

In [None]:
# Cell 1: Imports & helpers
from pathlib import Path
import json, random, time, csv
from faker import Faker
from PIL import Image, ImageDraw, ImageFont
import pytesseract
from pydantic import BaseModel, Field
from typing import List
import ollama
from concurrent.futures import ThreadPoolExecutor, TimeoutError
import difflib
import re

# NOTE(review): the In [8] cell below repeats this setup with per-OS font and
# Tesseract paths and reassigns every name defined here.
# INPUT_DIR holds the source images; OUTPUT_DIR receives the CSV/JSON/PNG files
# written by the image cells.
INPUT_DIR = Path("input_images")
OUTPUT_DIR = Path("output_statements")
OUTPUT_DIR.mkdir(exist_ok=True)  # create the output folder; no error if it already exists

fake = Faker("en_US")  # Faker instance using the en_US locale
# The font is requested by bare file name, so loading depends on Pillow being
# able to locate "arial.ttf" on this machine.
FONT = ImageFont.truetype("arial.ttf", 10)
# NOTE(review): `...` is the Ellipsis literal, i.e. a placeholder and not a path.
# It must be replaced with the real Tesseract executable path before this cell
# is usable on its own.
pytesseract.pytesseract.tesseract_cmd = ...


In [8]:
# %% Cell 1: Imports & helpers
from pathlib import Path
import json, random, time, csv
from faker import Faker
from PIL import Image, ImageDraw, ImageFont
import pytesseract
from pydantic import BaseModel, Field
from typing import List
import ollama
from concurrent.futures import ThreadPoolExecutor, TimeoutError
import difflib
import re
import platform

# Folders: images are read from INPUT_DIR, results are written to OUTPUT_DIR.
INPUT_DIR = Path("input_images")
OUTPUT_DIR = Path("output_statements")
OUTPUT_DIR.mkdir(exist_ok=True)

fake = Faker("en_US")

# Query the platform once; both the font and the Tesseract lookup depend on it.
_os_name = platform.system()

# Arial is stored in a different location on each supported platform.
_ARIAL_BY_OS = {
    "Darwin": "/Library/Fonts/Arial.ttf",
    "Windows": "C:\\Windows\\Fonts\\arial.ttf",
}
if _os_name not in _ARIAL_BY_OS:
    raise ValueError("Unsupported OS. Set FONT path manually.")
FONT = ImageFont.truetype(_ARIAL_BY_OS[_os_name], 10)

# Tesseract executable: on macOS prefer the /opt/homebrew location and fall back
# to /usr/local when that file is absent.
if _os_name == "Windows":
    pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
elif _os_name == "Darwin":
    _homebrew_tesseract = "/opt/homebrew/bin/tesseract"
    pytesseract.pytesseract.tesseract_cmd = (
        _homebrew_tesseract
        if Path(_homebrew_tesseract).exists()
        else "/usr/local/bin/tesseract"
    )
else:
    raise ValueError("Unsupported operating system. Please set tesseract_cmd manually.")

# Warn (but do not stop) when the chosen executable is missing.
_tesseract_found = Path(pytesseract.pytesseract.tesseract_cmd).exists()
if not _tesseract_found:
    print(f"Warning: Tesseract not found at {pytesseract.pytesseract.tesseract_cmd}. Please install or adjust the path.")

In [9]:

# Cell 2: Text Extraction
def extract_text(img_path: Path):
    """OCR one image, split the text into tokens and save them to a CSV.

    Tokens are the whitespace-separated chunks of the OCR text with any leading
    ``,``, ``.``, ``-`` or ``/`` characters removed; chunks that become empty
    are dropped. Punctuation inside or at the end of a chunk is kept, so
    ``1,234.56`` and ``NA.`` survive unchanged.

    The tokens are written, one per row under a ``text`` header, to
    ``OUTPUT_DIR / "extracted_<image stem>.csv"``.

    Returns the token list, or ``[]`` if any step raises (the error is printed).
    """
    print(f"   • Extracting text from {img_path.name} …", end=" ", flush=True)
    try:
        page = Image.open(img_path).convert("RGB")
        raw_text = pytesseract.image_to_string(page, config="--psm 6 --oem 3")

        # Delimiters are only ever discarded at the start of a token.
        leading_noise = ",.-/"
        extracted = []
        for chunk in raw_text.split():
            word = chunk.lstrip(leading_noise)
            if word:
                extracted.append(word)

        print("done.")
        print(f"   • Extracted tokens: {extracted}")

        csv_path = OUTPUT_DIR / f"extracted_{img_path.stem}.csv"
        with open(csv_path, "w", newline="", encoding="utf-8") as handle:
            out = csv.writer(handle)
            out.writerow(["text"])  # Header
            out.writerows([word] for word in extracted)
        print(f"   • Saved extracted tokens: {csv_path}")
        return extracted
    except Exception as exc:
        print(f"error: {type(exc).__name__}: {str(exc)}")
        return []

def process_folder_extract():
    """Run ``extract_text`` on every ``*.jp*`` and ``*.png`` file in INPUT_DIR.

    Prints a warning and returns when the folder has no matching image;
    otherwise prints a per-image summary of how many tokens were extracted.
    """
    img_files = [*INPUT_DIR.glob("*.jp*"), *INPUT_DIR.glob("*.png")]
    if not img_files:
        print("⚠️  No images found in", INPUT_DIR.resolve())
        return

    for image_file in img_files:
        print(f"Processing {image_file.name} for text extraction")
        tokens = extract_text(image_file)
        if tokens:
            print(f"Extracted {len(tokens)} text items")
        else:
            print("No text extracted or processing failed")

process_folder_extract()


Processing chase_highres.png for text extraction
   • Extracting text from chase_highres.png … done.
   • Extracted tokens: ['CHASE', '“', 'January', '1,', '2020', 'through', 'February', '23,', '2020', 'Primary', 'Account', '690783870', 'JPM', 'organ', 'Chase', 'Bank', 'NA.', 'Ohio/West', 'Virginia', 'Markets', 'POG', 'ae', '——SSSSSSS', 'Baton', 'Rouge.', 'LA', '70826-0180', 'CUSTOMER', 'SERVICE', 'INFORMATION', 'YS', 'TEE', 'TET', 'EL', 'WebSite:', 'www.Chase.com', 'Service', 'Center:', '1-800-935-9935', 'Hearing', 'Impaired:', '1-800-242-7383', 'Ldlaeaabd', 'dll', 'se', 'la!}', 'blaldaldh', 'bdbdlal', 'ParaEspand:', 'H7-2124275', 'ee', 'eusisen', 'CEA', 'el', 'KA', 'teat', 'sur', 'eles', 'International', 'Calls:', '1-713-262-1679', '—', 'Joseph', 'Cabrera', '—', '11020', 'NE', '14TH', 'AVE', '=', 'MIAMI', 'FL', '33161', '—>', '—?', 'Ee', 'ts)', '|', '—————', 'J', '—', '—', '—', '——', '—', '2', '——-', 'CHECKING', 'SUMMAR', 'INSTANCES', 'AMOUNT', 'Beginning', 'Balance', '$81,607.40', '

In [14]:
# Cell 3 – OCR-driven coordinates using multiple lists

from pathlib import Path
import json, re
from PIL import Image
import pytesseract

# ── 1.  Define your individual lists ─────────────────────────────
# Each set holds phrases to locate on the page. make_coords() splits a phrase on
# whitespace, normalises every piece with canon() and looks for the same run of
# consecutive OCR words, so spelling and spacing here have to mirror what the
# OCR produces (presumably why some amounts contain spaces or odd separators —
# TODO confirm against the extracted-token CSV).

# NOTE(review): chase_header and date_header are defined but not referenced by
# WHITE_BOX_PHRASES below, so they are never searched for.
chase_header = {"JPM organ Chase Bank NA. Ohio/West Virginia Markets pS amas ees"}

date_header  = {"January 1, 2020 through February 29, 2020"}

# Customer-identifying values.
account_number     = {"690783870"}
customer_mailing   = {"joseph cabrera", "miami fl 33161", "11020 ne 14th ave"}

# Values in the "Instances" column of the checking summary. make_coords() only
# keeps hits whose horizontal midpoint lies under that column header when the
# header is found.
checking_summary_instances = {"10", "2", "4", "70"}

# Values in the checking-summary amount column.
checking_summary_amount = {
    "$81,607.40", "125 883.63", "- 3,169.04", "- 15025.68", "$189 296.31"
}

# Deposits-and-additions table: amounts, description text and dates.
deposit_additions_amounts = {
    "$17,120.00", "24610.00", "1142400", "1349.00", "5 000.00", "3120.00",
    "33.138.00", "18 114.00", "6 908.63", "5 100.00", "$125883.63"
}
deposit_additions_description = {"deposit"}
deposit_additions_date        = {
    "01/02", "01/09", "01/14", "01/15", "01/21",
    "02/21", "02/23", "02/28", "02/29"
}

# ── 2.  Bundle them into categories ──────────────────────────────
# Maps a category name to the phrases belonging to it. The category is stored
# with every coordinate record and is what the box-drawing cells compare
# against KEEP_CATEGORIES.
WHITE_BOX_PHRASES: dict[str, set[str]] = {
    "sensitive_customer": customer_mailing | account_number,
    "transaction_desc"  : deposit_additions_description,
    "amount"            : checking_summary_amount | deposit_additions_amounts,
    "date"              : deposit_additions_date,
    "instance"          : checking_summary_instances,
}

# ── utility: word-level OCR boxes ────────────────────────────────
def ocr_word_boxes(img_path: Path):
    """Return word-level OCR boxes for an image together with the image size.

    Runs ``pytesseract.image_to_data`` on the RGB-converted image and keeps
    every entry whose stripped text is non-empty.

    Returns:
        ``(words, size)`` where ``words`` is a list of dicts with the keys
        ``text``, ``left``, ``top``, ``right``, ``bottom``, ``width`` and
        ``height`` (``right``/``bottom`` are ``left + width``/``top + height``)
        and ``size`` is the Pillow ``(width, height)`` tuple.
    """
    img = Image.open(img_path).convert("RGB")
    data = pytesseract.image_to_data(
        img, output_type=pytesseract.Output.DICT, config="--psm 6 --oem 3"
    )
    texts, lefts, tops = data["text"], data["left"], data["top"]
    widths, heights = data["width"], data["height"]

    boxes = []
    for idx, raw in enumerate(texts):
        label = raw.strip()
        if not label:
            continue  # entry without recognised text
        x, y, w, h = lefts[idx], tops[idx], widths[idx], heights[idx]
        boxes.append({
            "text": label,
            "left": x,
            "top": y,
            "right": x + w,
            "bottom": y + h,
            "width": w,
            "height": h,
        })
    return boxes, img.size

# helper: canonicalise text for matching
def canon(txt: str) -> str:
    """Normalise text for matching.

    Lower-cases, trims, collapses every whitespace run to one space, then
    removes all ``$``, ``,`` and ``.`` characters.
    """
    collapsed = re.sub(r'\s+', ' ', txt.lower().strip())
    return collapsed.translate(str.maketrans('', '', '$,.'))

# ── main coordinate builder ──────────────────────────────────────
def make_coords(img_path: Path):
    """Locate every phrase of WHITE_BOX_PHRASES on an image and save the boxes.

    A phrase matches a run of consecutive OCR words when the ``canon()`` form
    of each word equals the ``canon()`` form of the corresponding
    whitespace-separated piece of the phrase. The box of a match is the
    bounding rectangle of the matched words.

    Hits of the ``"instance"`` category are kept only when their horizontal
    midpoint lies within 5 px of the "Instances" header word; if that header is
    not found, all instance hits are kept.

    Args:
        img_path: image to OCR.

    Returns:
        List of dicts with the keys ``text``, ``category``, ``x``, ``y``,
        ``width`` and ``height``. The same list is written as JSON to
        ``OUTPUT_DIR / "white_box_coords_<image stem>.json"``.
    """
    words, _ = ocr_word_boxes(img_path)
    # Canonicalise each OCR word once; the matching loop compares slices of
    # this list instead of re-running canon() for every phrase and offset.
    canon_words = [canon(w["text"]) for w in words]
    coords = []

    # 1️⃣ locate “Instances” header (including common OCR misreadings)
    header_spellings = {"instances", "instance", "lnstances", "lntances"}
    inst_col = None
    for w, key in zip(words, canon_words):
        if key in header_spellings:
            inst_col = (w["left"], w["right"])
            break

    # 2️⃣ normal matching pass
    for category, phrase_set in WHITE_BOX_PHRASES.items():
        # Sets iterate in hash order, which changes between interpreter runs;
        # sorting keeps the saved JSON reproducible.
        for phrase in sorted(phrase_set):
            tokens = [canon(tok) for tok in phrase.split()]
            n = len(tokens)
            if n == 0:
                # A blank phrase would "match" an empty run and make min() fail.
                continue
            i = 0
            while i <= len(words) - n:
                if canon_words[i:i + n] != tokens:
                    i += 1
                    continue

                run = words[i:i + n]
                left   = min(w["left"]   for w in run)
                top    = min(w["top"]    for w in run)
                right  = max(w["right"]  for w in run)
                bottom = max(w["bottom"] for w in run)

                # filter “instance” hits by column (if header found)
                if category == "instance" and inst_col:
                    x_mid = (left + right) // 2
                    col_left, col_right = inst_col
                    margin = 5  # px of slack either side of the header word
                    if not (col_left - margin <= x_mid <= col_right + margin):
                        i += 1
                        continue

                coords.append({
                    "text": phrase,
                    "category": category,
                    "x": left,
                    "y": top,
                    "width":  right - left,
                    "height": bottom - top,
                })
                print(f"   • Found “{phrase}” → [{left},{top},{right-left},{bottom-top}]")
                i += n  # matches do not overlap

    out_json = OUTPUT_DIR / f"white_box_coords_{img_path.stem}.json"
    out_json.write_text(json.dumps(coords, indent=2))
    print(f"   • Saved {len(coords)} coords → {out_json.name}")
    return coords

# convenience runner
def process_white_box_terms():
    """Generate coordinates for the ``chase_highres`` image in INPUT_DIR.

    The ``.png`` file is preferred and ``.jpg`` is the fallback; when neither
    exists a warning is printed and nothing else happens.
    """
    candidates = (INPUT_DIR / "chase_highres.png", INPUT_DIR / "chase_highres.jpg")
    img_path = next((path for path in candidates if path.exists()), None)
    if img_path is None:
        print("⚠️  chase_highres image not found")
        return

    print(f"Processing {img_path.name} for OCR coordinate generation")
    make_coords(img_path)

process_white_box_terms()

Processing chase_highres.png for OCR coordinate generation
   • Found “690783870” → [1689,226,188,32]
   • Found “joseph cabrera” → [143,678,238,33]
   • Found “miami fl 33161” → [142,754,243,42]
   • Found “11020 ne 14th ave” → [143,719,313,26]
   • Found “deposit” → [386,2054,121,42]
   • Found “deposit” → [386,2101,120,43]
   • Found “deposit” → [386,2148,121,40]
   • Found “deposit” → [386,2194,120,43]
   • Found “deposit” → [386,2240,121,43]
   • Found “deposit” → [386,2287,121,43]
   • Found “deposit” → [386,2333,121,43]
   • Found “deposit” → [386,2380,121,43]
   • Found “deposit” → [386,2426,121,43]
   • Found “deposit” → [386,2476,121,43]
   • Found “1142400” → [1882,2152,145,27]
   • Found “- 3,169.04” → [1294,1597,153,32]
   • Found “1349.00” → [1900,2199,127,27]
   • Found “$125883.63” → [1843,2532,187,34]
   • Found “18 114.00” → [1882,2388,145,27]
   • Found “$17,120.00” → [1861,2057,166,33]
   • Found “- 15025.68” → [1275,1646,169,32]
   • Found “5 100.00” → [1900,2481,1

In [15]:
# %% Cell 4a: Generate Red Box Preview
# Categories that get a box; coordinate records with any other category are skipped.
KEEP_CATEGORIES = {"sensitive_customer", "transaction_desc", "amount", "date", "instance"}
# Substrings that exempt a record from boxing when they occur in its lower-cased
# text. Empty, so nothing is exempt.
NEVER_BLANK = set()

def apply_red_box_preview(img_path, coord_path):
    """Outline every selected coordinate in red and save a preview image.

    Args:
        img_path: image the coordinates refer to.
        coord_path: JSON file holding a list of records with ``category``,
            ``text``, ``x``, ``y``, ``width`` and ``height``.

    A record is drawn when its category is in KEEP_CATEGORIES and its
    lower-cased text contains no NEVER_BLANK entry. Each box is shrunk by one
    pixel per side and clamped to the image; boxes with no area left are
    skipped.

    Returns:
        Path of ``OUTPUT_DIR / "<image stem>_preview.png"``, or ``None`` when
        the JSON list is empty.
    """
    coords = json.loads(coord_path.read_text())
    if not coords:
        print("Error: No coordinates found in JSON file")
        return None

    canvas = Image.open(img_path).convert("RGB")
    pen = ImageDraw.Draw(canvas)
    max_x = canvas.size[0] - 1
    max_y = canvas.size[1] - 1
    inset = 1

    drawn = 0
    for entry in coords:
        category = entry.get("category")
        lowered = entry.get("text", "").lower()
        if category not in KEEP_CATEGORIES or any(kw in lowered for kw in NEVER_BLANK):
            continue

        x, y, w, h = (int(entry[key]) for key in ("x", "y", "width", "height"))

        x1, y1 = max(0, x + inset), max(0, y + inset)
        x2, y2 = min(max_x, x + w - inset), min(max_y, y + h - inset)
        if not (x2 > x1 and y2 > y1):
            continue

        pen.rectangle([x1, y1, x2, y2], outline="red", width=2)  # outline only, for preview
        drawn += 1
        print(f"   • Applied red box preview to '{entry.get('text', 'unknown')}' at ({x}, {y}) with size ({w}, {h})")

    out = OUTPUT_DIR / f"{img_path.stem}_preview.png"
    canvas.save(out, dpi=(300, 300))
    print(f"   • {drawn} red boxes applied → {out.name}")
    return out

def process_red_box_preview():
    """Build the red-box preview for the ``chase_highres`` image.

    Uses the ``.png`` file if present, else the ``.jpg``. Prints an error and
    returns when the image or its ``white_box_coords_<stem>.json`` file in
    OUTPUT_DIR is missing.
    """
    png_path = INPUT_DIR / "chase_highres.png"
    jpg_path = INPUT_DIR / "chase_highres.jpg"
    img_path = png_path if png_path.exists() else jpg_path
    if not img_path.exists():
        print(f"Error: Image not found at {img_path}")
        return

    coord_path = OUTPUT_DIR / f"white_box_coords_{img_path.stem}.json"
    if not coord_path.exists():
        print(f"Error: Coordinate file not found at {coord_path}")
        return

    print(f"Processing {img_path.name} for red box preview")
    apply_red_box_preview(img_path, coord_path)

process_red_box_preview()

Processing chase_highres.png for red box preview
   • Applied red box preview to '690783870' at (1689, 226) with size (188, 32)
   • Applied red box preview to 'joseph cabrera' at (143, 678) with size (238, 33)
   • Applied red box preview to 'miami fl 33161' at (142, 754) with size (243, 42)
   • Applied red box preview to '11020 ne 14th ave' at (143, 719) with size (313, 26)
   • Applied red box preview to 'deposit' at (386, 2054) with size (121, 42)
   • Applied red box preview to 'deposit' at (386, 2101) with size (120, 43)
   • Applied red box preview to 'deposit' at (386, 2148) with size (121, 40)
   • Applied red box preview to 'deposit' at (386, 2194) with size (120, 43)
   • Applied red box preview to 'deposit' at (386, 2240) with size (121, 43)
   • Applied red box preview to 'deposit' at (386, 2287) with size (121, 43)
   • Applied red box preview to 'deposit' at (386, 2333) with size (121, 43)
   • Applied red box preview to 'deposit' at (386, 2380) with size (121, 43)
   •

In [16]:
# %% Cell 4b: Apply White Boxes Final Output
# Same values as in the red-box preview cell, defined again here.
# Categories that get a white box; records with any other category are skipped.
KEEP_CATEGORIES = {"sensitive_customer", "transaction_desc", "amount", "date", "instance"}
# Substrings that exempt a record from blanking when they occur in its
# lower-cased text. Empty, so nothing is exempt.
NEVER_BLANK = set()

def apply_white_boxes(img_path, coord_path):
    """Paint a filled white rectangle over every selected coordinate.

    Args:
        img_path: image the coordinates refer to.
        coord_path: JSON file holding a list of records with ``category``,
            ``text``, ``x``, ``y``, ``width`` and ``height``.

    A record is painted when its category is in KEEP_CATEGORIES and its
    lower-cased text contains no NEVER_BLANK entry. Each box is shrunk by one
    pixel per side and clamped to the image; boxes with no area left are
    skipped.

    Returns:
        Path of ``OUTPUT_DIR / "<image stem>_boxed.png"``, or ``None`` when the
        JSON list is empty.
    """
    coords = json.loads(coord_path.read_text())
    if not coords:
        print("Error: No coordinates found in JSON file")
        return None

    page = Image.open(img_path).convert("RGB")
    brush = ImageDraw.Draw(page)
    page_w, page_h = page.size
    inset = 1

    painted = 0
    for record in coords:
        category = record.get("category")
        lowered = record.get("text", "").lower()

        blocked = any(kw in lowered for kw in NEVER_BLANK) if category in KEEP_CATEGORIES else True
        if blocked:
            continue

        x, y, w, h = (int(record[field]) for field in ("x", "y", "width", "height"))

        left_px = max(0, x + inset)
        top_px = max(0, y + inset)
        right_px = min(page_w - 1, x + w - inset)
        bottom_px = min(page_h - 1, y + h - inset)
        if right_px <= left_px or bottom_px <= top_px:
            continue

        brush.rectangle([left_px, top_px, right_px, bottom_px], fill="white")
        painted += 1
        print(f"   • Applied white box to '{record.get('text', 'unknown')}' at ({x}, {y}) with size ({w}, {h})")

    out = OUTPUT_DIR / f"{img_path.stem}_boxed.png"
    page.save(out, dpi=(300, 300))
    print(f"   • {painted} white boxes applied → {out.name}")
    return out

def process_white_box_application():
    """Apply white boxes to the ``chase_highres`` image.

    Uses the ``.png`` file if present, else the ``.jpg``. Prints an error and
    returns when the image or its ``white_box_coords_<stem>.json`` file in
    OUTPUT_DIR is missing.
    """
    img_path = INPUT_DIR / "chase_highres.png"
    if not img_path.exists():
        img_path = INPUT_DIR / "chase_highres.jpg"
        if not img_path.exists():
            print(f"Error: Image not found at {img_path}")
            return

    coord_path = OUTPUT_DIR / f"white_box_coords_{img_path.stem}.json"
    if coord_path.exists():
        print(f"Processing {img_path.name} for white-box application")
        apply_white_boxes(img_path, coord_path)
    else:
        print(f"Error: Coordinate file not found at {coord_path}")

process_white_box_application()

Processing chase_highres.png for white-box application
   • Applied white box to '690783870' at (1689, 226) with size (188, 32)
   • Applied white box to 'joseph cabrera' at (143, 678) with size (238, 33)
   • Applied white box to 'miami fl 33161' at (142, 754) with size (243, 42)
   • Applied white box to '11020 ne 14th ave' at (143, 719) with size (313, 26)
   • Applied white box to 'deposit' at (386, 2054) with size (121, 42)
   • Applied white box to 'deposit' at (386, 2101) with size (120, 43)
   • Applied white box to 'deposit' at (386, 2148) with size (121, 40)
   • Applied white box to 'deposit' at (386, 2194) with size (120, 43)
   • Applied white box to 'deposit' at (386, 2240) with size (121, 43)
   • Applied white box to 'deposit' at (386, 2287) with size (121, 43)
   • Applied white box to 'deposit' at (386, 2333) with size (121, 43)
   • Applied white box to 'deposit' at (386, 2380) with size (121, 43)
   • Applied white box to 'deposit' at (386, 2426) with size (121, 43)

In [None]:
# Cell 5: PDF Imports & Helpers
from pdf2image import convert_from_path
import os

# PDFs are read from PDF_INPUT_DIR; the PDF cells write their temp page images,
# CSVs, coordinate JSON files and boxed PNGs to PDF_OUTPUT_DIR.
PDF_INPUT_DIR = Path("pdf_inputs")
PDF_OUTPUT_DIR = Path("pdf_output_statements")
PDF_OUTPUT_DIR.mkdir(exist_ok=True)  # create the output folder; no error if it already exists

# pdf2image needs the Poppler command-line tools to render pages.
# If they are not found automatically, pass poppler_path to convert_from_path,
# e.g. poppler_path=r"C:\path\to\poppler\bin".

# %% Cell 6: PDF Text Extraction
def extract_text_from_pdf(pdf_path: Path, page_num: int):
    """OCR one PDF page, split the text into tokens and save them to a CSV.

    The page is rendered at 300 dpi. A copy of the rendered page is saved as
    ``PDF_OUTPUT_DIR / "temp_<pdf stem>_page<n>.png"``; the OCR itself runs on
    the in-memory image.

    Tokens are the whitespace-separated chunks of the OCR text with any leading
    ``,``, ``.``, ``-`` or ``/`` characters removed; chunks that become empty
    are dropped. They are written, one per row under a ``text`` header, to
    ``PDF_OUTPUT_DIR / "extracted_<pdf stem>_page<n>.csv"``.

    Args:
        pdf_path: PDF file to read.
        page_num: page number passed as both ``first_page`` and ``last_page``.

    Returns the token list, or ``[]`` when no page image is produced or any
    step raises (the error is printed).
    """
    print(f"   • Extracting text from {pdf_path.name} (page {page_num}) …", end=" ", flush=True)
    try:
        rendered = convert_from_path(pdf_path, first_page=page_num, last_page=page_num, dpi=300)
        if not rendered:
            print("error: No images extracted from PDF page")
            return []
        page_img = rendered[0]

        # Keep a PNG copy of the rendered page on disk.
        temp_img_path = PDF_OUTPUT_DIR / f"temp_{pdf_path.stem}_page{page_num}.png"
        page_img.save(temp_img_path, "PNG")

        raw_text = pytesseract.image_to_string(page_img, config="--psm 6 --oem 3")

        # Delimiters are only ever discarded at the start of a token.
        stripped_chunks = (chunk.lstrip(",.-/") for chunk in raw_text.split())
        extracted = [word for word in stripped_chunks if word]

        print("done.")
        print(f"   • Extracted tokens: {extracted}")

        csv_path = PDF_OUTPUT_DIR / f"extracted_{pdf_path.stem}_page{page_num}.csv"
        with open(csv_path, "w", newline="", encoding="utf-8") as handle:
            out = csv.writer(handle)
            out.writerow(["text"])
            out.writerows([word] for word in extracted)
        print(f"   • Saved extracted tokens: {csv_path}")
        return extracted
    except Exception as exc:
        print(f"error: {type(exc).__name__}: {str(exc)}")
        return []

def process_pdf_extract():
    """Run ``extract_text_from_pdf`` on every page of every PDF in PDF_INPUT_DIR.

    Prints a warning and returns when the folder holds no ``*.pdf`` file;
    otherwise prints a per-page summary of how many tokens were extracted.
    """
    # pdf2image is already a dependency of this notebook; the local import
    # keeps this function usable without touching the import cell.
    from pdf2image import pdfinfo_from_path

    pdf_files = list(PDF_INPUT_DIR.glob("*.pdf"))
    if not pdf_files:
        print("⚠️  No PDFs found in", PDF_INPUT_DIR.resolve())
        return

    for pdf_path in pdf_files:
        print(f"Processing {pdf_path.name} for text extraction")
        # Read the page count from the PDF metadata. Rendering the whole
        # document at 300 dpi only to count the images is slow and holds every
        # page in memory, and each page is rendered again below anyway.
        page_count = int(pdfinfo_from_path(str(pdf_path))["Pages"])
        for page_num in range(1, page_count + 1):
            extracted = extract_text_from_pdf(pdf_path, page_num)
            if not extracted:
                print(f"No text extracted or processing failed for page {page_num}")
            else:
                print(f"Extracted {len(extracted)} text items from page {page_num}")

process_pdf_extract()

# %% Cell 7: PDF Coordinate Generation
def _iter_phrase_spans(words, canon_words, phrase):
    """Yield ``(left, top, right, bottom)`` for each non-overlapping run of OCR words matching ``phrase``."""
    tokens = [canon(tok) for tok in phrase.split()]
    n = len(tokens)
    if n == 0:
        return  # blank phrase: nothing to match, and min()/max() would fail on an empty run
    i = 0
    while i <= len(words) - n:
        if canon_words[i:i + n] != tokens:
            i += 1
            continue
        run = words[i:i + n]
        yield (
            min(w["left"] for w in run),
            min(w["top"] for w in run),
            max(w["right"] for w in run),
            max(w["bottom"] for w in run),
        )
        i += n  # continue after the matched run


def make_coords_from_pdf(pdf_path: Path, page_num: int):
    """Locate every phrase of WHITE_BOX_PHRASES on one PDF page and save the boxes.

    The page is rendered at 300 dpi, saved as
    ``PDF_OUTPUT_DIR / "temp_<pdf stem>_page<n>.png"`` and OCR'd with
    ``ocr_word_boxes``. A phrase matches a run of consecutive OCR words when
    the ``canon()`` forms agree; the box is the bounding rectangle of the run.

    Hits of the ``"instance"`` category are kept only when their horizontal
    midpoint lies within 5 px of the "Instances" column. The column is taken
    from the header word; when the header is not recognised it is derived from
    the extent of the instance hits themselves.

    Each hit is recorded exactly once. Previously, when the header was missing,
    the fallback pass appended every non-instance hit and the main pass
    appended it again, so the JSON contained duplicates.

    Args:
        pdf_path: PDF file to read.
        page_num: page number passed as both ``first_page`` and ``last_page``.

    Returns:
        List of dicts with the keys ``text``, ``category``, ``x``, ``y``,
        ``width`` and ``height`` (``[]`` when no page image is produced). The
        list is also written as JSON to
        ``PDF_OUTPUT_DIR / "white_box_coords_<pdf stem>_page<n>.json"``.
    """
    print(f"Processing {pdf_path.name} (page {page_num}) for OCR coordinate generation")
    # Convert PDF page to image
    images = convert_from_path(pdf_path, first_page=page_num, last_page=page_num, dpi=300)
    if not images:
        print("error: No images extracted from PDF page")
        return []
    img = images[0]
    temp_img_path = PDF_OUTPUT_DIR / f"temp_{pdf_path.stem}_page{page_num}.png"
    img.save(temp_img_path, "PNG")

    words, _ = ocr_word_boxes(temp_img_path)
    # Canonicalise each OCR word once and reuse it for every phrase.
    canon_words = [canon(w["text"]) for w in words]
    coords = []

    # 1️⃣ Locate the header “Instances” to determine column boundaries
    inst_col = None
    header_tokens = {"instances", "instance", "lnstances", "lntances"}  # common OCR slips
    for w, key in zip(words, canon_words):
        if key in header_tokens:
            inst_col = (w["left"], w["right"])
            break

    # 2️⃣ Header not detected: derive the column from the instance hits.
    #    This pass only measures; it must not add anything to `coords`.
    #    NOTE(review): the derived column spans all instance hits, so every one
    #    of them passes the filter in step 3.
    if inst_col is None:
        instance_spans = [
            span
            for phrase in WHITE_BOX_PHRASES.get("instance", ())
            for span in _iter_phrase_spans(words, canon_words, phrase)
        ]
        if instance_spans:
            inst_col = (
                int(min(left for left, _, _, _ in instance_spans)),
                int(max(right for _, _, right, _ in instance_spans)),
            )
            print(f"   • Instances column derived: {inst_col}")

    # 3️⃣ Record coordinates, filtering 'instance' hits to the detected column
    for category, phrase_set in WHITE_BOX_PHRASES.items():
        # Sets iterate in hash order, which changes between interpreter runs;
        # sorting keeps the saved JSON reproducible.
        for phrase in sorted(phrase_set):
            for left, top, right, bottom in _iter_phrase_spans(words, canon_words, phrase):
                coord = {
                    "text": phrase,
                    "category": category,
                    "x": int(left),
                    "y": int(top),
                    "width": int(right - left),
                    "height": int(bottom - top)
                }

                if category == "instance" and inst_col:
                    x_mid = (left + right) // 2
                    col_left, col_right = inst_col
                    margin = 5  # px slack
                    if col_left - margin <= x_mid <= col_right + margin:
                        coords.append(coord)
                        print(f"   • Instance boxed “{phrase}” → [{left},{top},{right-left},{bottom-top}]")
                else:
                    coords.append(coord)
                    print(f"   • Found “{phrase}” → [{left},{top},{right-left},{bottom-top}]")

    out_json = PDF_OUTPUT_DIR / f"white_box_coords_{pdf_path.stem}_page{page_num}.json"
    out_json.write_text(json.dumps(coords, indent=2))
    print(f"   • Saved {len(coords)} coords → {out_json.name}")
    return coords

def process_pdf_coordinates():
    """Run ``make_coords_from_pdf`` on every page of every PDF in PDF_INPUT_DIR.

    Prints a warning and returns when the folder holds no ``*.pdf`` file.
    """
    # pdf2image is already a dependency of this notebook; the local import
    # keeps this function usable without touching the import cell.
    from pdf2image import pdfinfo_from_path

    pdf_files = list(PDF_INPUT_DIR.glob("*.pdf"))
    if not pdf_files:
        print("⚠️  No PDFs found in", PDF_INPUT_DIR.resolve())
        return

    for pdf_path in pdf_files:
        print(f"Processing {pdf_path.name} for coordinate generation")
        # Read the page count from the PDF metadata. Rendering the whole
        # document at 300 dpi only to count the images is slow and holds every
        # page in memory, and each page is rendered again per call below.
        page_count = int(pdfinfo_from_path(str(pdf_path))["Pages"])
        for page_num in range(1, page_count + 1):
            make_coords_from_pdf(pdf_path, page_num)

process_pdf_coordinates()

# %% Cell 8: PDF White Box Application
def apply_white_boxes_to_pdf(pdf_path: Path, coord_path: Path):
    """Render one PDF page and paint white boxes over its saved coordinates.

    The page number is whatever follows the last ``_page`` in the stem of
    ``coord_path``. The page is rendered at 300 dpi.

    A record is painted when its category is in KEEP_CATEGORIES and its
    lower-cased text contains no NEVER_BLANK entry. Each box is shrunk by one
    pixel per side and clamped to the image; boxes with no area left are
    skipped.

    Args:
        pdf_path: PDF the coordinates belong to.
        coord_path: ``white_box_coords_<pdf stem>_page<n>.json`` file.

    Returns:
        Path of ``PDF_OUTPUT_DIR / "<pdf stem>_page<n>_boxed.png"``, or
        ``None`` when no page image is produced.
    """
    coords = json.loads(coord_path.read_text())

    # Text after the last "_page" in the file stem identifies the page.
    page_tag = coord_path.stem.split("_page")[-1]
    page_no = int(page_tag.split(".")[0])

    rendered = convert_from_path(pdf_path, first_page=page_no, last_page=page_no, dpi=300)
    if not rendered:
        print("error: No images extracted from PDF page")
        return
    page_img = rendered[0]
    brush = ImageDraw.Draw(page_img)
    max_x = page_img.size[0] - 1
    max_y = page_img.size[1] - 1
    inset = 1  # shrink each box by one pixel per side

    painted = 0
    for record in coords:
        category = record.get("category")
        lowered = record.get("text", "").lower()
        if category not in KEEP_CATEGORIES or any(kw in lowered for kw in NEVER_BLANK):
            continue

        x, y, w, h = (int(record[field]) for field in ("x", "y", "width", "height"))

        x1, y1 = max(0, x + inset), max(0, y + inset)
        x2, y2 = min(max_x, x + w - inset), min(max_y, y + h - inset)
        if not (x2 > x1 and y2 > y1):
            continue  # too small after inset

        brush.rectangle([x1, y1, x2, y2], fill="white")
        painted += 1

        print(f"   • Applied white box to '{record.get('text', 'unknown')}' at ({x}, {y}) with size ({w}, {h})")

    out = PDF_OUTPUT_DIR / f"{pdf_path.stem}_page{page_tag}_boxed.png"
    page_img.save(out, dpi=(300, 300))
    print(f"   • {painted} white boxes applied → {out.name}")
    return out

def process_pdf_redaction():
    """Apply white boxes for every coordinate file found in PDF_OUTPUT_DIR.

    Each ``white_box_coords_<pdf stem>_page<n>.json`` file is paired with
    ``PDF_INPUT_DIR / "<pdf stem>.pdf"``. Coordinate files whose PDF is missing
    are reported and skipped. Prints a warning and returns when there are no
    coordinate files at all.
    """
    # Sorted so pages are processed in a stable order (glob order is arbitrary).
    coord_files = sorted(PDF_OUTPUT_DIR.glob("white_box_coords_*.json"))
    if not coord_files:
        print("⚠️  No coordinate files found in", PDF_OUTPUT_DIR.resolve())
        return

    for coord_path in coord_files:
        # Strip only the leading prefix and only the trailing "_page<n>" suffix.
        # replace()/split()[0] would mangle PDF names that themselves contain
        # "white_box_coords_" or "_page" (e.g. "report_pages_v2").
        pdf_stem = coord_path.stem.removeprefix("white_box_coords_").rsplit("_page", 1)[0]
        pdf_path = PDF_INPUT_DIR / f"{pdf_stem}.pdf"
        if not pdf_path.exists():
            print(f"⚠️  Skipping {coord_path.name}: {pdf_path} not found")
            continue
        print(f"Processing {pdf_path.name} for white-box application")
        apply_white_boxes_to_pdf(pdf_path, coord_path)

process_pdf_redaction()