In [None]:
# 1. Import libraries
# 2. Image -> OCR, generate a csv with all words
# 3. OCR, define specified words with coordinates
# 4. Apply white boxes to coordinates 

In [None]:
# Cell 1: Cell 1 Imports & helpers
from pathlib import Path
import json, random, time, csv
from faker import Faker
from PIL import Image, ImageDraw, ImageFont
import pytesseract
from pydantic import BaseModel, Field
from typing import List
import ollama
from concurrent.futures import ThreadPoolExecutor, TimeoutError
import difflib
import re

INPUT_DIR = Path("input_images")
OUTPUT_DIR = Path("output_statements")
OUTPUT_DIR.mkdir(exist_ok=True)

fake = Faker("en_US")
FONT = ImageFont.truetype("arial.ttf", 10)
pytesseract.pytesseract.tesseract_cmd = ...


In [5]:
# %% Cell 1: Cell 1 Imports & helpers but for my macbook
from pathlib import Path
import json, random, time, csv
from faker import Faker
from PIL import Image, ImageDraw, ImageFont
import pytesseract
from pydantic import BaseModel, Field
from typing import List
import ollama
from concurrent.futures import ThreadPoolExecutor, TimeoutError
import difflib
import re
import platform

INPUT_DIR = Path("input_images")
OUTPUT_DIR = Path("output_statements")
OUTPUT_DIR.mkdir(exist_ok=True)

fake = Faker("en_US")

# Set font based on operating system
if platform.system() == "Darwin":
    FONT = ImageFont.truetype("/Library/Fonts/Arial.ttf", 10)
elif platform.system() == "Windows":
    FONT = ImageFont.truetype("C:\\Windows\\Fonts\\arial.ttf", 10)
else:
    raise ValueError("Unsupported OS. Set FONT path manually.")

# Set Tesseract path based on operating system
if platform.system() == "Darwin":  # macOS
    pytesseract.pytesseract.tesseract_cmd = "/opt/homebrew/bin/tesseract"
    if not Path(pytesseract.pytesseract.tesseract_cmd).exists():
        pytesseract.pytesseract.tesseract_cmd = "/usr/local/bin/tesseract"
elif platform.system() == "Windows":  # Windows
    pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
else:
    raise ValueError("Unsupported operating system. Please set tesseract_cmd manually.")

if not Path(pytesseract.pytesseract.tesseract_cmd).exists():
    print(f"Warning: Tesseract not found at {pytesseract.pytesseract.tesseract_cmd}. Please install or adjust the path.")

In [9]:

# Cell 2: Text Extraction
def extract_text(img_path: Path):
    print(f"   • Extracting text from {img_path.name} …", end=" ", flush=True)
    try:
        img = Image.open(img_path).convert("RGB")
        text = pytesseract.image_to_string(img, config="--psm 6 --oem 3")
        # Improved tokenization: preserve numbers with commas and decimals
        tokens = []
        current_token = ""
        for char in text:
            if char.isspace():
                if current_token:
                    tokens.append(current_token)
                    current_token = ""
            elif char in [',', '.', '-'] and current_token and current_token[-1].isdigit():
                current_token += char  # Keep commas and periods in numbers
            elif char in [',', '.', '-', '/'] and not current_token:
                continue  # Skip standalone delimiters
            else:
                current_token += char
        if current_token:
            tokens.append(current_token)

        extracted = [token.strip() for token in tokens if token.strip()]
        print("done.")
        print(f"   • Extracted tokens: {extracted}")
        csv_path = OUTPUT_DIR / f"extracted_{img_path.stem}.csv"
        with open(csv_path, "w", newline="", encoding="utf-8") as f:
            writer = csv.writer(f)
            writer.writerow(["text"])  # Header
            for token in extracted:
                writer.writerow([token])
        print(f"   • Saved extracted tokens: {csv_path}")
        return extracted
    except Exception as e:
        print(f"error: {type(e).__name__}: {str(e)}")
        return []

def process_folder_extract():
    img_files = list(INPUT_DIR.glob("*.jp*")) + list(INPUT_DIR.glob("*.png"))
    if not img_files:
        print("⚠️  No images found in", INPUT_DIR.resolve())
        return
    for img in img_files:
        print(f"Processing {img.name} for text extraction")
        extracted = extract_text(img)
        if not extracted:
            print("No text extracted or processing failed")
        else:
            print(f"Extracted {len(extracted)} text items")

process_folder_extract()


Processing chase_highres.png for text extraction
   • Extracting text from chase_highres.png … done.
   • Extracted tokens: ['CHASE', '“', 'January', '1,', '2020', 'through', 'February', '23,', '2020', 'Primary', 'Account', '690783870', 'JPM', 'organ', 'Chase', 'Bank', 'NA.', 'Ohio/West', 'Virginia', 'Markets', 'POG', 'ae', '——SSSSSSS', 'Baton', 'Rouge.', 'LA', '70826-0180', 'CUSTOMER', 'SERVICE', 'INFORMATION', 'YS', 'TEE', 'TET', 'EL', 'WebSite:', 'www.Chase.com', 'Service', 'Center:', '1-800-935-9935', 'Hearing', 'Impaired:', '1-800-242-7383', 'Ldlaeaabd', 'dll', 'se', 'la!}', 'blaldaldh', 'bdbdlal', 'ParaEspand:', 'H7-2124275', 'ee', 'eusisen', 'CEA', 'el', 'KA', 'teat', 'sur', 'eles', 'International', 'Calls:', '1-713-262-1679', '—', 'Joseph', 'Cabrera', '—', '11020', 'NE', '14TH', 'AVE', '=', 'MIAMI', 'FL', '33161', '—>', '—?', 'Ee', 'ts)', '|', '—————', 'J', '—', '—', '—', '——', '—', '2', '——-', 'CHECKING', 'SUMMAR', 'INSTANCES', 'AMOUNT', 'Beginning', 'Balance', '$81,607.40', '

In [14]:
# Cell 3 – OCR-driven coordinates using multiple lists

from pathlib import Path
import json, re
from PIL import Image
import pytesseract

# ── 1.  Define your individual lists ─────────────────────────────
chase_header = {"JPM organ Chase Bank NA. Ohio/West Virginia Markets pS amas ees"}

date_header  = {"January 1, 2020 through February 29, 2020"}

account_number     = {"690783870"}
customer_mailing   = {"joseph cabrera", "miami fl 33161", "11020 ne 14th ave"}

checking_summary_instances = {"10", "2", "4", "70"}

checking_summary_amount = {
    "$81,607.40", "125 883.63", "- 3,169.04", "- 15025.68", "$189 296.31"
}

deposit_additions_amounts = {
    "$17,120.00", "24610.00", "1142400", "1349.00", "5 000.00", "3120.00",
    "33.138.00", "18 114.00", "6 908.63", "5 100.00", "$125883.63"
}
deposit_additions_description = {"deposit"}
deposit_additions_date        = {
    "01/02", "01/09", "01/14", "01/15", "01/21",
    "02/21", "02/23", "02/28", "02/29"
}

# ── 2.  Bundle them into categories ──────────────────────────────
WHITE_BOX_PHRASES: dict[str, set[str]] = {
    "sensitive_customer": customer_mailing | account_number,
    "transaction_desc"  : deposit_additions_description,
    "amount"            : checking_summary_amount | deposit_additions_amounts,
    "date"              : deposit_additions_date,
    "instance"          : checking_summary_instances,
}

# ── utility: word-level OCR boxes ────────────────────────────────
def ocr_word_boxes(img_path: Path):
    img  = Image.open(img_path).convert("RGB")
    data = pytesseract.image_to_data(
        img, output_type=pytesseract.Output.DICT, config="--psm 6 --oem 3"
    )
    words = []
    for i, txt in enumerate(data["text"]):
        txt = txt.strip()
        if not txt:
            continue
        words.append({
            "text": txt,
            "left": data["left"][i],
            "top":  data["top"][i],
            "right": data["left"][i] + data["width"][i],
            "bottom": data["top"][i] + data["height"][i],
            "width":  data["width"][i],
            "height": data["height"][i],
        })
    return words, img.size

# helper: canonicalise text for matching
def canon(txt: str) -> str:
    txt = txt.lower().strip()
    txt = re.sub(r'\s+', ' ', txt)
    return txt.replace('$', '').replace(',', '').replace('.', '')

# ── main coordinate builder ──────────────────────────────────────
def make_coords(img_path: Path):
    words, _ = ocr_word_boxes(img_path)
    coords   = []

    # 1️⃣ locate “Instances” header
    inst_col = None
    for w in words:
        if canon(w["text"]) in {"instances", "instance", "lnstances", "lntances"}:
            inst_col = (w["left"], w["right"])
            break

    # 2️⃣ normal matching pass
    for category, phrase_set in WHITE_BOX_PHRASES.items():
        for phrase in phrase_set:
            tokens = [canon(tok) for tok in phrase.split()]
            n = len(tokens)
            i = 0
            while i <= len(words) - n:
                if [canon(w["text"]) for w in words[i:i+n]] == tokens:
                    left   = min(w["left"]   for w in words[i:i+n])
                    top    = min(w["top"]    for w in words[i:i+n])
                    right  = max(w["right"]  for w in words[i:i+n])
                    bottom = max(w["bottom"] for w in words[i:i+n])

                    # filter “instance” hits by column (if header found)
                    if category == "instance" and inst_col:
                        x_mid = (left + right) // 2
                        col_left, col_right = inst_col
                        margin = 5           # ← original narrow margin
                        if not (col_left - margin <= x_mid <= col_right + margin):
                            i += 1
                            continue

                    coords.append({
                        "text": phrase,
                        "category": category,
                        "x": left,
                        "y": top,
                        "width":  right - left,
                        "height": bottom - top,
                    })
                    print(f"   • Found “{phrase}” → [{left},{top},{right-left},{bottom-top}]")
                    i += n
                else:
                    i += 1

    out_json = OUTPUT_DIR / f"white_box_coords_{img_path.stem}.json"
    out_json.write_text(json.dumps(coords, indent=2))
    print(f"   • Saved {len(coords)} coords → {out_json.name}")
    return coords

# convenience runner
def process_white_box_terms():
    img_path = INPUT_DIR / "chase_highres.png"
    if not img_path.exists():
        img_path = INPUT_DIR / "chase_highres.jpg"
    if not img_path.exists():
        print("⚠️  chase_highres image not found"); return

    print(f"Processing {img_path.name} for OCR coordinate generation")
    make_coords(img_path)

process_white_box_terms()

Processing chase_highres.png for OCR coordinate generation
   • Found “690783870” → [1689,226,188,32]
   • Found “joseph cabrera” → [143,678,238,33]
   • Found “miami fl 33161” → [142,754,243,42]
   • Found “11020 ne 14th ave” → [143,719,313,26]
   • Found “deposit” → [386,2054,121,42]
   • Found “deposit” → [386,2101,120,43]
   • Found “deposit” → [386,2148,121,40]
   • Found “deposit” → [386,2194,120,43]
   • Found “deposit” → [386,2240,121,43]
   • Found “deposit” → [386,2287,121,43]
   • Found “deposit” → [386,2333,121,43]
   • Found “deposit” → [386,2380,121,43]
   • Found “deposit” → [386,2426,121,43]
   • Found “deposit” → [386,2476,121,43]
   • Found “1142400” → [1882,2152,145,27]
   • Found “- 3,169.04” → [1294,1597,153,32]
   • Found “1349.00” → [1900,2199,127,27]
   • Found “$125883.63” → [1843,2532,187,34]
   • Found “18 114.00” → [1882,2388,145,27]
   • Found “$17,120.00” → [1861,2057,166,33]
   • Found “- 15025.68” → [1275,1646,169,32]
   • Found “5 100.00” → [1900,2481,1

In [None]:
# Cell 4a: Generate Red Box Preview
KEEP_CATEGORIES = {"sensitive_customer", "transaction_desc", "amount", "date", "instance"}
NEVER_BLANK = set()

def apply_red_box_preview(img_path, coord_path):
    coords = json.loads(coord_path.read_text())
    if not coords:
        print("Error: No coordinates found in JSON file")
        return None
    img = Image.open(img_path).convert("RGB")
    draw = ImageDraw.Draw(img)
    image_width, image_height = img.size

    boxes = 0
    for c in coords:
        cat = c.get("category")
        text = c.get("text", "").lower()

        if cat not in KEEP_CATEGORIES:
            continue
        if any(kw in text for kw in NEVER_BLANK):
            continue

        x, y, w, h = map(int, (c["x"], c["y"], c["width"], c["height"]))

        inset = 1
        x1 = max(0, x + inset)
        y1 = max(0, y + inset)
        x2 = min(image_width - 1, x + w - inset)
        y2 = min(image_height - 1, y + h - inset)
        if x2 <= x1 or y2 <= y1:
            continue

        draw.rectangle([x1, y1, x2, y2], outline="red", width=2)  # Red outline for preview
        boxes += 1
        print(f"   • Applied red box preview to '{c.get('text', 'unknown')}' at ({x}, {y}) with size ({w}, {h})")

    out = OUTPUT_DIR / f"{img_path.stem}_preview.png"
    img.save(out, dpi=(300, 300))
    print(f"   • {boxes} red boxes applied → {out.name}")
    return out

def process_red_box_preview():
    img_path = INPUT_DIR / "chase_highres.png"
    if not img_path.exists():
        img_path = INPUT_DIR / "chase_highres.jpg"
    if not img_path.exists():
        print(f"Error: Image not found at {img_path}")
        return
    coord_path = OUTPUT_DIR / f"white_box_coords_{img_path.stem}.json"
    if not coord_path.exists():
        print(f"Error: Coordinate file not found at {coord_path}")
        return
    print(f"Processing {img_path.name} for red box preview")
    apply_red_box_preview(img_path, coord_path)

process_red_box_preview()

Processing chase_highres.png for red box preview
   • Applied red box preview to '690783870' at (1689, 226) with size (188, 32)
   • Applied red box preview to 'joseph cabrera' at (143, 678) with size (238, 33)
   • Applied red box preview to 'miami fl 33161' at (142, 754) with size (243, 42)
   • Applied red box preview to '11020 ne 14th ave' at (143, 719) with size (313, 26)
   • Applied red box preview to 'deposit' at (386, 2054) with size (121, 42)
   • Applied red box preview to 'deposit' at (386, 2101) with size (120, 43)
   • Applied red box preview to 'deposit' at (386, 2148) with size (121, 40)
   • Applied red box preview to 'deposit' at (386, 2194) with size (120, 43)
   • Applied red box preview to 'deposit' at (386, 2240) with size (121, 43)
   • Applied red box preview to 'deposit' at (386, 2287) with size (121, 43)
   • Applied red box preview to 'deposit' at (386, 2333) with size (121, 43)
   • Applied red box preview to 'deposit' at (386, 2380) with size (121, 43)
   •

In [None]:
# Cell 4b: Apply White Boxes (Redaction Only)
KEEP_CATEGORIES = {"sensitive_customer", "transaction_desc", "amount", "date", "instance"}
NEVER_BLANK = set()

def apply_white_boxes(img_path, coord_path):
    coords = json.loads(coord_path.read_text())
    if not coords:
        print("Error: No coordinates found in JSON file")
        return None
    img = Image.open(img_path).convert("RGB")
    draw = ImageDraw.Draw(img)
    image_width, image_height = img.size

    boxes = 0
    for c in coords:
        cat = c.get("category")
        text = c.get("text", "").lower()

        if cat not in KEEP_CATEGORIES:
            continue
        if any(kw in text for kw in NEVER_BLANK):
            continue

        x, y, w, h = map(int, (c["x"], c["y"], c["width"], c["height"]))

        # Apply uniform box size for numeric fields (amount, instance, and account numbers)
        is_numeric = bool(re.fullmatch(r"\$?[0-9,.\-]+", text))
        if is_numeric:
            uniform_w, uniform_h = 200, 30  # Retain current size
            # Center the uniform box over the original coordinates
            x = max(0, x - (uniform_w - w) // 2)
            y = max(0, y - (uniform_h - h) // 2)
            w, h = uniform_w, uniform_h

        # Increase box size by 10% to fully cover text
        padding = max(5, int(min(w, h) * 0.05))  # Minimum 5px, max 10% of smallest dimension
        x1 = max(0, x - padding)
        y1 = max(0, y - padding)
        x2 = min(image_width - 1, x + w + padding)
        y2 = min(image_height - 1, y + h + padding)
        if x2 <= x1 or y2 <= y1:
            continue

        draw.rectangle([x1, y1, x2, y2], fill="white")
        boxes += 1
        print(f"   • Applied white box at ({x}, {y}) with size ({w + 2 * padding}, {h + 2 * padding})")

    out = OUTPUT_DIR / f"{img_path.stem}_redacted.png"
    img.save(out, dpi=(300, 300))
    print(f"   • {boxes} white boxes applied → {out.name}")
    return out

def process_white_box_application():
    img_path = INPUT_DIR / "chase_highres.png"
    if not img_path.exists():
        img_path = INPUT_DIR / "chase_highres.jpg"
    if not img_path.exists():
        print(f"Error: Image not found at {img_path}")
        return
    coord_path = OUTPUT_DIR / f"white_box_coords_{img_path.stem}.json"
    if not coord_path.exists():
        print(f"Error: Coordinate file not found at {coord_path}")
        return
    print(f"Processing {img_path.name} for white-box application")
    apply_white_boxes(img_path, coord_path)

process_white_box_application()

Processing chase_highres.png for white-box application
   • Applied white box at (1683, 227) with size (210, 40)
   • Applied white box at (143, 678) with size (248, 43)
   • Applied white box at (142, 754) with size (253, 52)
   • Applied white box at (143, 719) with size (323, 36)
   • Applied white box at (386, 2054) with size (131, 52)
   • Applied white box at (386, 2101) with size (130, 53)
   • Applied white box at (386, 2148) with size (131, 50)
   • Applied white box at (386, 2194) with size (130, 53)
   • Applied white box at (386, 2240) with size (131, 53)
   • Applied white box at (386, 2287) with size (131, 53)
   • Applied white box at (386, 2333) with size (131, 53)
   • Applied white box at (386, 2380) with size (131, 53)
   • Applied white box at (386, 2426) with size (131, 53)
   • Applied white box at (386, 2476) with size (131, 53)
   • Applied white box at (1855, 2151) with size (210, 40)
   • Applied white box at (1294, 1597) with size (163, 42)
   • Applied white

In [33]:
# %% Cell 5 — place synthetic text (auto-size + category-aware pools)
from pathlib import Path
import platform
from PIL import Image, ImageDraw, ImageFont, ImageEnhance
import json, random, re
from datetime import datetime
from faker import Faker

# ------------------------------------------------------------------
# 1.  constants / helpers
# ------------------------------------------------------------------
KEEP_CATEGORIES = {"chase_header", "sensitive_customer", "transaction_desc", "amount", "date", "instance"}
NEVER_BLANK = set()  # add strings here to protect them

fake = Faker("en_US")

# Fixed font size (30pt for 3x scaled image, equivalent to 10pt when scaled back)
fixed_font_size = 30

def mono_font(size: int):
    sys = platform.system()
    cand = (r"C:\Windows\Fonts\consola.ttf",
            "/System/Library/Fonts/Menlo.ttc",
            "/usr/share/fonts/truetype/dejavu/DejaVuSansMono.ttf")
    for p in cand:
        try: return ImageFont.truetype(p, size)
        except OSError: pass
    return ImageFont.load_default()

def prop_font(size: int):
    sys = platform.system()
    cand = (r"C:\Windows\Fonts\arial.ttf",
            "/Library/Fonts/Arial.ttf",
            "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf")
    for p in cand:
        try: return ImageFont.truetype(p, size)
        except OSError: pass
    return ImageFont.load_default()

def bold_prop_font(size: int):
    sys = platform.system()
    cand = (r"C:\Windows\Fonts\arialbd.ttf",  # Arial Bold
            "/Library/Fonts/Arial Bold.ttf",
            "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf")
    for p in cand:
        try: return ImageFont.truetype(p, size)
        except OSError: pass
    return prop_font(size)  # Fall back to regular if bold not found

# ------------------------------------------------------------------
# 2.  build fresh pools
# ------------------------------------------------------------------
def gen_pools():
    pools = {
        "chase_header": [  # Ordered: branch name, department, address
            f"{fake.city()} Regional Office",  # e.g., "Denver Regional Office"
            "Customer Service Dept",
            f"{fake.city()}, {fake.state_abbr()} {fake.postcode()}"  # e.g., "Austin, TX 78701"
        ],
        "sensitive_customer": [],  # Will be populated with ordered data
        "transaction_desc": {"deposit", "withdrawal", "transfer", "payment"},
        "amount": {f"${random.uniform(100, 9000):,.2f}" for _ in range(30)},  # 100.00 to 9000.00 with decimals
        "date": {fake.date_between(datetime.strptime("2025-01-01", "%Y-%m-%d"), datetime.strptime("2025-02-28", "%Y-%m-%d")).strftime("%m/%d") for _ in range(30)},
        "instance": {str(random.randint(100, 9000)) for _ in range(30)},  # 100 to 9000, no decimals
    }
    # Generate ordered sensitive_customer data with account number first, name last
    account_num = str(random.randint(100000000, 999999999))  # 9-digit account number
    name = fake.name()  # e.g., "Travis Williams"
    street = fake.street_address()  # e.g., "123 Main St"
    city_state_zip = f"{fake.city()} {fake.state_abbr()} {fake.postcode()}"  # e.g., "Austin TX 78701"
    pools["sensitive_customer"] = [account_num, street, city_state_zip, name]  # Ordered: account, street, city-state-zip, name
    return pools

POOLS = gen_pools()

# ------------------------------------------------------------------
# 3.  main inserter
# ------------------------------------------------------------------
def insert_text(img_path: Path, coord_path: Path):
    coords = json.loads(coord_path.read_text())
    redacted = OUTPUT_DIR / f"{img_path.stem}_redacted.png"
    if not redacted.exists():
        print("❌ Run the redaction (Cell 4) first."); return
    im = Image.open(redacted).convert("RGB")
    
    # Image cleanup: Enhance contrast to make whites whiter and blacks blacker
    enhancer = ImageEnhance.Contrast(im)
    im = enhancer.enhance(2.0)  # Increase contrast by a factor of 2
    im = im.convert("RGB")  # Ensure RGB mode after enhancement
    
    drw = ImageDraw.Draw(im)
    W, H = im.size
    placed = 0

    # Sort all coords by y-position to handle header first
    all_coords = sorted(coords, key=lambda x: x["y"])
    header_coords = [c for c in all_coords if c["category"] == "chase_header"]
    sens_coords = [c for c in all_coords if c["category"] == "sensitive_customer"]
    other_coords = [c for c in all_coords if c["category"] not in {"chase_header", "sensitive_customer"}]

    # Handle header coords
    header_index = 0
    for c in header_coords:
        cat = c["category"]
        if cat not in KEEP_CATEGORIES: continue
        if any(kw in c["text"].lower() for kw in NEVER_BLANK): continue

        x, y, w, h = map(int, (c["x"], c["y"], c["width"], c["height"]))
        if w < 4 or h < 4: continue

        if header_index < len(POOLS["chase_header"]):
            txt = POOLS["chase_header"][header_index]
            header_index += 1
        else:
            txt = random.choice(list(POOLS.get(cat, {"—"})))

        fnt = prop_font(fixed_font_size)  # Fixed font size for all text, regular weight
        tw, th = drw.textbbox((0, 0), txt, font=fnt)[2:]
        tx = x + (w - tw) // 2  # Center-align header text
        ty = y + (h - th) // 2
        drw.text((tx, ty), txt, fill="black", font=fnt)
        placed += 1
        print(f"{cat:>16} | {txt:<15} @ {fixed_font_size}pt  box ({w}×{h})")

    # Handle sensitive_customer coords with explicit ordering and aligned text
    sens_index = 0
    for c in sens_coords:
        cat = c["category"]
        if cat not in KEEP_CATEGORIES: continue
        if any(kw in c["text"].lower() for kw in NEVER_BLANK): continue

        x, y, w, h = map(int, (c["x"], c["y"], c["width"], c["height"]))
        if w < 4 or h < 4: continue

        if sens_index < len(POOLS["sensitive_customer"]):
            txt = POOLS["sensitive_customer"][sens_index]
            sens_index += 1
        else:
            txt = random.choice(list(POOLS.get(cat, {"—"})))

        # Use bold font for all sensitive_customer fields (account, street, city-state-zip, name)
        fnt = bold_prop_font(fixed_font_size) if sens_index <= len(POOLS["sensitive_customer"]) else prop_font(fixed_font_size)
        tw, th = drw.textbbox((0, 0), txt, font=fnt)[2:]
        
        # Left-align all sensitive_customer text for uniform left edge
        tx = x + 5  # Add a small left padding (5px) for aesthetics
        # Align baselines vertically by using a fixed offset from the top
        ty = y + 5  # Fixed 5px offset from the top of the box (adjust as needed)

        drw.text((tx, ty), txt, fill="black", font=fnt)
        placed += 1
        print(f"{cat:>16} | {txt:<15} @ {fixed_font_size}pt  box ({w}×{h})")

    # Handle other coords
    for c in other_coords:
        cat = c["category"]
        if cat not in KEEP_CATEGORIES: continue
        if any(kw in c["text"].lower() for kw in NEVER_BLANK): continue

        x, y, w, h = map(int, (c["x"], c["y"], c["width"], c["height"]))
        if w < 4 or h < 4: continue

        txt = random.choice(list(POOLS.get(cat, {"—"})))

        fnt = prop_font(fixed_font_size)  # Fixed font size for all text, regular weight
        tw, th = drw.textbbox((0, 0), txt, font=fnt)[2:]
        tx = x + w - tw - 1 if bool(re.fullmatch(r"\$?[0-9,.\-]+", txt)) else x + (w - tw) // 2
        ty = y + (h - th) // 2
        drw.text((tx, ty), txt, fill="black", font=fnt)
        placed += 1
        print(f"{cat:>16} | {txt:<15} @ {fixed_font_size}pt  box ({w}×{h})")

    # Save to synthetic_out directory
    SYNTHETIC_OUT_DIR = Path("synthetic_out")
    SYNTHETIC_OUT_DIR.mkdir(exist_ok=True)
    out = SYNTHETIC_OUT_DIR / f"{img_path.stem}_synthetic.png"
    im.save(out, dpi=(300, 300))
    print(f"✅ Placed {placed} synthetic fields → {out.name}")

# convenience runner ------------------------------------------------
def process_dynamic_synthetic_insertion():
    img = INPUT_DIR / "chase_highres.png"
    if not img.exists(): img = INPUT_DIR / "chase_highres.jpg"
    coord = OUTPUT_DIR / f"white_box_coords_{img.stem}.json"
    if not coord.exists():
        print("❌ coordinate file missing – run Cell 3"); return
    insert_text(img, coord)

process_dynamic_synthetic_insertion()

sensitive_customer | 873225837       @ 30pt  box (188×32)
sensitive_customer | 722 Michael Ferry @ 30pt  box (238×33)
sensitive_customer | Allenport VA 45220 @ 30pt  box (313×26)
sensitive_customer | Patrick Chapman @ 30pt  box (243×42)
          amount | $753.81         @ 30pt  box (165×35)
          amount | $570.62         @ 30pt  box (166×31)
        instance | 7423            @ 30pt  box (32×27)
          amount | $6,199.47       @ 30pt  box (153×32)
        instance | 6186            @ 30pt  box (17×27)
          amount | $1,990.62       @ 30pt  box (169×32)
        instance | 1463            @ 30pt  box (20×25)
          amount | $8,785.56       @ 30pt  box (184×45)
        instance | 2733            @ 30pt  box (35×26)
transaction_desc | deposit         @ 30pt  box (121×42)
          amount | $1,990.62       @ 30pt  box (166×33)
            date | 02/25           @ 30pt  box (83×27)
transaction_desc | withdrawal      @ 30pt  box (120×43)
          amount | $7,994.29       @ 30p