In [None]:
# 1. Import libraries and set up input/output paths
# 2. Run OCR on each input image and save every extracted word to a CSV
# 3. Run word-level OCR and record the coordinates of the specified words/phrases
# 4. Draw white boxes over the recorded coordinates

In [None]:
# Cell 1: Imports & helpers
from pathlib import Path
import json, random, time, csv
from faker import Faker
from PIL import Image, ImageDraw, ImageFont
import pytesseract
from pydantic import BaseModel, Field
from typing import List
import ollama
from concurrent.futures import ThreadPoolExecutor, TimeoutError
import difflib
import re

# Folder that is scanned for the statement images to process.
INPUT_DIR = Path("input_images")
# Folder that receives the generated CSV, JSON and boxed-PNG files.
OUTPUT_DIR = Path("output_statements")
# Create the output folder if it is missing (parent directories are not created).
OUTPUT_DIR.mkdir(exist_ok=True)

# NOTE(review): `fake` and `FONT` are not referenced by any cell shown in this
# notebook; presumably they are used by later/removed cells — confirm before removing.
fake = Faker("en_US")
# Loading "arial.ttf" depends on that font being discoverable on this machine.
FONT = ImageFont.truetype("arial.ttf", 10)
# NOTE(review): `...` is the Ellipsis literal, presumably a placeholder for the
# path of the local Tesseract executable — replace it before running any OCR cell.
pytesseract.pytesseract.tesseract_cmd = ...


In [None]:

# Cell 2: Text Extraction
# Characters treated as noise when a whitespace-delimited token contains nothing else.
_STANDALONE_DELIMITERS = frozenset(",.-/")


def _tokenize_ocr_text(text: str) -> list[str]:
    """Split OCR text on whitespace and drop tokens made solely of delimiters."""
    return [
        token
        for token in text.split()
        if not all(ch in _STANDALONE_DELIMITERS for ch in token)
    ]


def extract_text(img_path: Path) -> list[str]:
    """OCR an image, tokenise the text and save the tokens to a one-column CSV.

    The recognised text is split on whitespace. Tokens consisting only of the
    delimiters `,`, `.`, `-` or `/` are discarded; every other token is kept
    unchanged, so a leading sign or decimal point ("-3,169.04", ".50") survives.
    The tokens are written to OUTPUT_DIR / "extracted_<stem>.csv" under a
    "text" header.

    Args:
        img_path: Path of the image to run OCR on.

    Returns:
        The list of extracted tokens, or an empty list if any step fails
        (the error is printed, not raised).
    """
    print(f"   • Extracting text from {img_path.name} …", end=" ", flush=True)
    try:
        # The context manager releases the file handle as soon as the pixel
        # data has been copied into the converted image.
        with Image.open(img_path) as source:
            img = source.convert("RGB")
        text = pytesseract.image_to_string(img, config="--psm 6 --oem 3")

        # Only tokens that are *entirely* delimiters are noise; stripping a
        # leading delimiter would turn "-3,169.04" into a positive amount.
        extracted = _tokenize_ocr_text(text)
        print("done.")
        print(f"   • Extracted tokens: {extracted}")

        csv_path = OUTPUT_DIR / f"extracted_{img_path.stem}.csv"
        with open(csv_path, "w", newline="", encoding="utf-8") as f:
            writer = csv.writer(f)
            writer.writerow(["text"])  # Header
            for token in extracted:
                writer.writerow([token])
        print(f"   • Saved extracted tokens: {csv_path}")
        return extracted
    except Exception as e:
        # Deliberate best-effort: report the failure and let the caller carry
        # on with the next image.
        print(f"error: {type(e).__name__}: {str(e)}")
        return []

def process_folder_extract():
    """Run extract_text on every "*.jp*" and "*.png" file found in INPUT_DIR.

    Prints a warning and returns early when no image matches; otherwise
    reports, per image, how many tokens were extracted or that nothing was.
    """
    candidates = [*INPUT_DIR.glob("*.jp*"), *INPUT_DIR.glob("*.png")]
    if len(candidates) == 0:
        print("⚠️  No images found in", INPUT_DIR.resolve())
        return

    for image_file in candidates:
        print(f"Processing {image_file.name} for text extraction")
        tokens = extract_text(image_file)
        if tokens:
            print(f"Extracted {len(tokens)} text items")
        else:
            print("No text extracted or processing failed")

# Executes on cell run: OCR every image in INPUT_DIR and write one CSV per image.
process_folder_extract()


In [None]:
# Cell 3 – OCR-driven coordinates using multiple lists

from pathlib import Path
import json, re
from PIL import Image
import pytesseract

# ── 1.  Define your individual lists ─────────────────────────────
# How these strings are used: make_coords() splits each phrase on whitespace,
# normalises every piece with canon() (lower-case, `$`, `,` and `.` removed) and
# looks for the same sequence among consecutive OCR words. Case and those three
# punctuation characters therefore do not matter; spacing between words does.
#
# NOTE(review): several entries ("ti2408", "0114", "by", "33.138.00") look like
# verbatim OCR misreads — presumably copied from the OCR output on purpose; confirm.
# NOTE(review): the name, address and account number below look like real customer
# data — confirm they may be committed with the notebook.

# chase_header and date_header are defined here but are not part of
# WHITE_BOX_PHRASES below, so they are never searched for.
chase_header = {"JPM organ Chase Bank NA. Ohio/West Virginia Markets pS amas ees"}

date_header  = {"January 1, 2020 through February 29, 2020"}

account_number     = {"690783870"}
customer_mailing   = {"joseph cabrera", "miami fl 33161", "11020 ne 14th ave"}

# Matches for these are additionally restricted by make_coords() to the column
# under the "Instances" header word, when that header is found.
checking_summary_instances = {"10", "2", "by", "70"}

checking_summary_amount = {
    "$81,607.40", "125 883.63", "- 3,169.04", "- 15025.68", "$189 296.31"
}

deposit_additions_amounts = {
    "$17 120.00", "24610.00", "ti2408", "1349.00", "5 000.00", "3120.00",
    "33.138.00", "18 114.00", "6 908.63", "5 100.00", "$125883.63"
}
deposit_additions_description = {"deposit"}
deposit_additions_date        = {
    "01/02", "01/09", "0114", "01/15", "01/21",
    "02/21", "02/23", "02/28", "02/29"
}

# ── 2.  Bundle them into categories ──────────────────────────────
# Maps a category label (stored with every coordinate record) to the set of
# phrases searched for under that label.
WHITE_BOX_PHRASES: dict[str, set[str]] = {
    "sensitive_customer": customer_mailing | account_number,
    "transaction_desc"  : deposit_additions_description,
    "amount"            : checking_summary_amount | deposit_additions_amounts,
    "date"              : deposit_additions_date,
    "instance"          : checking_summary_instances,
}

# ── utility: word-level OCR boxes ────────────────────────────────
def ocr_word_boxes(img_path: Path):
    """Run word-level OCR on an image and return the non-empty words with boxes.

    Args:
        img_path: Path of the image to analyse.

    Returns:
        A tuple `(words, size)`: `words` is a list of dicts with the keys
        "text", "left", "top", "right", "bottom", "width" and "height";
        `size` is the `.size` of the RGB-converted image.
    """
    image = Image.open(img_path).convert("RGB")
    ocr = pytesseract.image_to_data(
        image, config="--psm 6 --oem 3", output_type=pytesseract.Output.DICT
    )

    boxes = []
    for idx, raw in enumerate(ocr["text"]):
        label = raw.strip()
        if label == "":
            continue  # entry carries no recognised text
        x0 = ocr["left"][idx]
        y0 = ocr["top"][idx]
        box_w = ocr["width"][idx]
        box_h = ocr["height"][idx]
        boxes.append({
            "text": label,
            "left": x0,
            "top": y0,
            "right": x0 + box_w,
            "bottom": y0 + box_h,
            "width": box_w,
            "height": box_h,
        })
    return boxes, image.size

# helper: canonicalise text for matching
def canon(txt: str) -> str:
    """Normalise text for matching.

    Lower-cases, trims, collapses every whitespace run to a single space and
    removes all `$`, `,` and `.` characters.
    """
    collapsed = re.sub(r'\s+', ' ', txt.lower().strip())
    # One translate pass deletes the same three characters as chained replaces.
    return collapsed.translate(str.maketrans('', '', '$,.'))

# ── main coordinate builder ──────────────────────────────────────
def make_coords(img_path: Path):
    """Find every WHITE_BOX_PHRASES entry among the image's OCR words and save the boxes.

    Each phrase is split on whitespace, every piece is normalised with canon(),
    and the resulting sequence is compared against runs of consecutive OCR
    words. A match yields one bounding box enclosing all words of the run.
    Matches in the "instance" category are kept only when their horizontal
    midpoint lies within 5 px of the "Instances" header word, provided that
    header was found in the OCR output.

    Args:
        img_path: Path of the image to OCR.

    Returns:
        A list of dicts with the keys "text", "category", "x", "y", "width"
        and "height". The same list is written as JSON to
        OUTPUT_DIR / "white_box_coords_<stem>.json".
    """
    words, _ = ocr_word_boxes(img_path)
    # Canonicalise every OCR word once, instead of once per phrase and position.
    canon_words = [canon(w["text"]) for w in words]
    coords = []

    # 1) locate the "Instances" header (spellings include likely OCR misreads)
    inst_col = None
    for w, w_text in zip(words, canon_words):
        if w_text in {"instances", "instance", "lnstances", "lntances"}:
            inst_col = (w["left"], w["right"])
            break

    # 2) normal matching pass
    for category, phrase_set in WHITE_BOX_PHRASES.items():
        # Sorted so the output order does not depend on set iteration order.
        for phrase in sorted(phrase_set):
            tokens = [canon(tok) for tok in phrase.split()]
            n = len(tokens)
            if n == 0:
                # An empty/whitespace-only phrase would "match" an empty slice
                # and make min()/max() below raise ValueError.
                continue

            i = 0
            while i <= len(words) - n:
                if canon_words[i:i + n] != tokens:
                    i += 1
                    continue

                run = words[i:i + n]
                left   = min(w["left"]   for w in run)
                top    = min(w["top"]    for w in run)
                right  = max(w["right"]  for w in run)
                bottom = max(w["bottom"] for w in run)

                # filter "instance" hits by column (if header found)
                if category == "instance" and inst_col:
                    x_mid = (left + right) // 2
                    col_left, col_right = inst_col
                    margin = 5  # tolerance in pixels around the header word
                    if not (col_left - margin <= x_mid <= col_right + margin):
                        i += 1
                        continue

                coords.append({
                    "text": phrase,
                    "category": category,
                    "x": left,
                    "y": top,
                    "width":  right - left,
                    "height": bottom - top,
                })
                print(f"   • Found “{phrase}” → [{left},{top},{right-left},{bottom-top}]")
                # Jump past the matched run so matches of one phrase never overlap.
                i += n

    out_json = OUTPUT_DIR / f"white_box_coords_{img_path.stem}.json"
    out_json.write_text(json.dumps(coords, indent=2))
    print(f"   • Saved {len(coords)} coords → {out_json.name}")
    return coords

# convenience runner
def process_white_box_terms():
    """Generate white-box coordinates for the chase_highres image in INPUT_DIR.

    The PNG is preferred and the JPG is the fallback; when neither exists a
    warning is printed and nothing else happens.
    """
    options = (INPUT_DIR / "chase_highres.png", INPUT_DIR / "chase_highres.jpg")
    target = next((path for path in options if path.exists()), None)
    if target is None:
        print("⚠️  chase_highres image not found")
        return

    print(f"Processing {target.name} for OCR coordinate generation")
    make_coords(target)

# Executes on cell run: OCR the chase_highres image and write its coordinates JSON.
process_white_box_terms()

In [None]:
# Cell 4: Apply White Boxes
def apply_white_boxes(img_path, coord_path, keep_categories=None, never_blank=None):
    """Paint white rectangles over the regions listed in a coordinates JSON file.

    Args:
        img_path: Path of the image to redact.
        coord_path: Path of a JSON file holding a list of dicts with the keys
            "text", "category", "x", "y", "width" and "height".
        keep_categories: Categories whose boxes are drawn. When None, the
            module-level KEEP_CATEGORIES is used if it exists; otherwise every
            category is drawn.
        never_blank: Lower-case substrings; an entry whose text contains any of
            them is left untouched. When None, the module-level NEVER_BLANK is
            used if it exists; otherwise nothing is protected.

    Returns:
        Path of the saved "<stem>_boxed.png" file inside OUTPUT_DIR.
    """
    # The filters used to be read as bare globals that this notebook never
    # defines, so the first coordinate raised NameError. Look them up
    # defensively so existing definitions are still honoured.
    if keep_categories is None:
        keep_categories = globals().get("KEEP_CATEGORIES")
    if never_blank is None:
        never_blank = globals().get("NEVER_BLANK") or ()

    coords = json.loads(coord_path.read_text())
    # Release the file handle once the pixels are copied into the RGB image.
    with Image.open(img_path) as source:
        img = source.convert("RGB")
    draw = ImageDraw.Draw(img)

    inset = 1  # shrink each box by one pixel on every side
    boxes = 0
    for c in coords:
        cat = c.get("category")
        text = c.get("text", "").lower()

        if keep_categories is not None and cat not in keep_categories:
            continue
        if any(kw in text for kw in never_blank):
            continue

        x, y, w, h = map(int, (c["x"], c["y"], c["width"], c["height"]))

        # Clamp the inset rectangle to the image bounds.
        x1 = max(0, x + inset)
        y1 = max(0, y + inset)
        x2 = min(img.width - 1, x + w - inset)
        y2 = min(img.height - 1, y + h - inset)
        if x2 <= x1 or y2 <= y1:
            continue        # too small after inset

        draw.rectangle([x1, y1, x2, y2], fill="white")
        boxes += 1

        print(f"   • Applied white box to '{c.get('text', 'unknown')}' at ({x}, {y}) with size ({w}, {h})")

    out = OUTPUT_DIR / f"{img_path.stem}_boxed.png"
    img.save(out, dpi=(300, 300))
    print(f"   • {boxes} white boxes applied → {out.name}")
    return out

def process_white_box_application():
    """Apply the saved white-box coordinates to the chase_highres image.

    The PNG in INPUT_DIR is preferred and the JPG is the fallback. Prints a
    warning and returns without doing anything when the image or its
    "white_box_coords_<stem>.json" file in OUTPUT_DIR is missing.
    """
    img_path = INPUT_DIR / "chase_highres.png"
    if not img_path.exists():
        img_path = INPUT_DIR / "chase_highres.jpg"
    # Same guard as process_white_box_terms(): without it a missing image
    # surfaces as a FileNotFoundError from inside apply_white_boxes.
    if not img_path.exists():
        print("⚠️  chase_highres image not found")
        return

    coord_path = OUTPUT_DIR / f"white_box_coords_{img_path.stem}.json"
    if not coord_path.exists():
        print(f"⚠️  {coord_path.name} not found – run process_white_box_terms() first")
        return

    print(f"Processing {img_path.name} for white-box application")
    apply_white_boxes(img_path, coord_path)

# Note: apply_white_boxes reads KEEP_CATEGORIES and NEVER_BLANK, which this notebook does not define.
# Example: KEEP_CATEGORIES = {"sensitive_customer", "transaction_desc", "amount", "date", "instance"}
# Example: NEVER_BLANK = set()