In [1]:
# Cell 1: Import libraries
import os
import json
import re
from PIL import Image, ImageFilter, ImageEnhance
import pytesseract
import pandas as pd
import numpy as np
from faker import Faker
from datetime import datetime, timedelta
import random
import ollama
from pydantic import BaseModel, Field
from typing import List
from reportlab.lib import colors
from reportlab.lib.pagesizes import letter
from reportlab.lib.units import inch
from reportlab.platypus import SimpleDocTemplate, Paragraph, Table, TableStyle, Spacer, Image as ReportLabImage
from reportlab.lib.styles import getSampleStyleSheet

# Initialize Faker
fake = Faker()

# Ensure Tesseract is in PATH
pytesseract.pytesseract.tesseract_cmd = r'C:\Users\cha\AppData\Local\Programs\Tesseract-OCR\tesseract.exe'


In [2]:

# # Cell 2: Directory setup
SAMPLE_STATEMENTS_DIR = "sample_statements"
SAMPLE_LOGOS_DIR = "sample_logos"
SYNTHETIC_STATEMENTS_DIR = "synthetic_statements"

# Create directories if they don’t exist
for directory in [SAMPLE_STATEMENTS_DIR, SAMPLE_LOGOS_DIR, SYNTHETIC_STATEMENTS_DIR]:
    os.makedirs(directory, exist_ok=True)

# Bank-to-logo mapping
BANK_LOGO_MAPPING = {
    "wells fargo": "wells_fargo_logo.png",
    "chase": "chase_bank_logo.png",
    "bank of america": "bank_of_america_logo.png",
    "pnc": "pnc_bank_logo.png",
    "generic": "generic_logo.png"
}


In [3]:

# Cell 3: Pydantic model for transactions
class Transaction(BaseModel):
    description: str = Field(..., max_length=35, description="Transaction description, max 35 characters")
    category: str
    amount: float


In [4]:

# Cell 4: Generate category lists using LLM
def generate_category_lists() -> tuple[List[str], List[str]]:
    prompt = """
    Generate two lists of bank transaction categories in JSON format. One list for reasons someone loses money (e.g., utilities, subscriptions) and one for reasons someone gains money (e.g., gifts, deposits). Each list should have 5-7 unique categories, each 1-2 words, title case, no punctuation. Return as:
    {
      "loss_categories": ["Category One", "Category Two", ...],
      "gain_categories": ["Category One", "Category Two", ...]
    }
    """
    try:
        response = ollama.generate(model="mistral:7b-instruct-v0.3-q4_0", prompt=prompt)
        category_data = json.loads(response['response'].strip())
        loss_categories = category_data.get("loss_categories", [])
        gain_categories = category_data.get("gain_categories", [])
        loss_categories = [cat for cat in loss_categories if isinstance(cat, str) and 1 <= len(cat.split()) <= 2]
        gain_categories = [cat for cat in gain_categories if isinstance(cat, str) and 1 <= len(cat.split()) <= 2]
        if len(loss_categories) < 5 or len(gain_categories) < 5:
            raise ValueError("Insufficient valid categories")
    except (json.JSONDecodeError, ValueError):
        loss_categories = ["Utility Payment", "Subscription Fee", "Online Purchase", "Rent Payment", "Grocery Shopping", "Insurance Bill"]
        gain_categories = ["Salary Deposit", "Tax Refund", "Gift Received", "Client Payment", "Investment Return", "Cash Deposit"]
    return loss_categories, gain_categories


In [5]:

# Cell 5: Generate transaction description using LLM
def generate_transaction_description(amount: float, category: str) -> dict:
    prompt = f"""
    Generate a bank transaction description (3-5 words, max 25 characters) for a transaction in the '{category}' category.
    Rules:
    - Use title case (e.g., 'Grocery Store Purchase').
    - No punctuation (commas, periods, etc.).
    - No parentheses, dashes, or dollar signs.
    - No amounts or numbers as words.
    - No typos.
    - Use simple, clear phrases.
    - Examples: 'Grocery Store Purchase', 'Utility Bill Payment', 'Salary Direct Deposit'
    """
    try:
        response = ollama.generate(model="mistral:7b-instruct-v0.3-q4_0", prompt=prompt)
        description = response['response'].strip()[:25]
    except:
        description = f"{category} Transaction"  # Fallback
    description = description.replace("(", "").replace(")", "").replace(",", "").replace(":", "").replace("-", "").replace("$", "").replace(".", "")
    description = ' '.join(word.capitalize() for word in description.split())[:25]
    words = description.split()
    if len(words) < 3 or len(words) > 5:
        description = f"{category} Transaction"[:25]
    transaction = Transaction(description=description, category=category, amount=amount)
    return transaction.dict()


In [6]:
# Cell 6: Generate synthetic bank statement
def generate_bank_statement(num_transactions: int = 10, account_holder: str = "John Doe") -> pd.DataFrame:
    loss_categories, gain_categories = generate_category_lists()
    start_date = datetime.now() - timedelta(days=90)
    dates = [start_date + timedelta(days=random.randint(0, 90)) for _ in range(num_transactions)]
    transactions = []
    for _ in range(num_transactions):
        is_gain = random.choice([True, False])
        category = random.choice(gain_categories if is_gain else loss_categories)
        amount = round(random.uniform(50, 1000), 2) if is_gain else round(random.uniform(-500, -10), 2)
        transaction = generate_transaction_description(amount, category)
        transactions.append(transaction)
    data = {
        "Date": [d.strftime("%Y-%m-%d") for d in dates],
        "Description": [t["description"] for t in transactions],
        "Category": [t["category"] for t in transactions],
        "Amount": [t["amount"] for t in transactions],
        "Balance": [0.0] * num_transactions,
        "Account Holder": [account_holder] * num_transactions,
        "Transaction ID": [(fake.bban()[:10] + str(i).zfill(4)) for i in range(num_transactions)]
    }
    df = pd.DataFrame(data)
    df = df.sort_values("Date")
    initial_balance = 1000.0
    df["Balance"] = initial_balance + df["Amount"].cumsum()
    return df


In [7]:

# Cell 7: Normalize logo dimensions
def normalize_logo(logo_path: str, max_width: int = 150, max_height: int = 50) -> Image.Image:
    img = Image.open(logo_path)
    img = img.convert("RGB")  # Ensure compatibility with ReportLab
    aspect = img.height / img.width
    if img.width > max_width:
        img = img.resize((max_width, int(max_width * aspect)), Image.Resampling.LANCZOS)
    if img.height > max_height:
        img = img.resize((int(max_height / aspect), max_height), Image.Resampling.LANCZOS)
    return img


In [8]:
# Cell 8: Identify bank using LLM
def identify_bank(image_path: str) -> str:
    try:
        image = Image.open(image_path).convert('L')
        image = ImageEnhance.Contrast(image).enhance(3.0)  # Increased contrast
        image = image.filter(ImageFilter.SHARPEN)
        image = image.resize((1200, 1800), Image.Resampling.LANCZOS)  # Higher resolution
        # Apply thresholding for better text clarity
        image = image.point(lambda p: p > 128 and 255)  # Binary threshold
        text = pytesseract.image_to_string(image, config='--psm 3').lower()  # Default PSM for mixed text
        # Log OCR text for debugging
        with open(os.path.join(SYNTHETIC_STATEMENTS_DIR, f"ocr_{os.path.basename(image_path)}.txt"), 'w') as f:
            f.write(text)
        # Limit text to first 1000 characters to reduce LLM load
        text = text[:1000]
        prompt = f"""
        Given the following text extracted from a bank statement, identify the bank name. Return only the bank name in lowercase (e.g., 'wells fargo', 'chase', 'pnc', 'citibank') or 'generic' if the bank is not recognized. Do not include extra text or explanations. The text may be noisy or contain errors.
        Supported banks: wells fargo, chase, pnc, citibank.
        Text:
        {text}
        """
        response = ollama.generate(model="mistral:7b-instruct-v0.3-q4_0", prompt=prompt)
        bank = response['response'].strip().lower()
        if bank in BANK_LOGO_MAPPING:
            return bank.replace(" ", "_")
        return "generic"
    except Exception as e:
        print(f"Error in bank detection for {image_path}: {e}")
        return "generic"

In [9]:

# Cell 9: Get logo path
def get_logo_path(bank: str) -> str:
    logo_filename = BANK_LOGO_MAPPING.get(bank.replace("_", " "), "generic_logo.png")
    logo_path = os.path.join(SAMPLE_LOGOS_DIR, logo_filename)
    if os.path.exists(logo_path):
        return logo_path
    print(f"Logo not found for bank: {bank}")
    return None


In [10]:
# Cell 10: Process sample statement image (Updated)
def process_bank_statement_image(image_path: str) -> tuple[dict, dict, str]:
    image = Image.open(image_path).convert('L')
    
    # Enhanced preprocessing
    enhancer = ImageEnhance.Contrast(image)
    image = enhancer.enhance(3.0)  # Increase contrast
    image = image.filter(ImageFilter.MedianFilter())  # Reduce noise
    image = image.filter(ImageFilter.SHARPEN)  # Sharpen edges
    image = image.resize((1200, 1800), Image.Resampling.LANCZOS)  # Higher resolution
    
    # Adaptive thresholding for better text clarity
    image = image.point(lambda p: 255 if p > image.convert('L').point(lambda x: x).mean() else 0)
    
    text = pytesseract.image_to_string(image, config='--psm 6 --oem 3')  # PSM 6 for single uniform block, OEM 3 for default
    lines = text.split('\n')
    layout = {}
    current_section = None
    bank = identify_bank(image_path)
    
    # Map noisy section headers to standard ones
    section_mapping = {
        r'summary|account summary|checking summary': 'CHECKING SUMMARY',
        r'transaction|history|activity detail': 'TRANSACTION HISTORY',
        r'deposits|additions': 'DEPOSITS AND ADDITIONS',
        r'withdrawals|subtractions': 'WITHDRAWALS',
    }
    
    for line in lines:
        line = line.strip()
        if not line or len(line) < 5 or not any(c.isalpha() for c in line):
            continue
        line_upper = line.upper()
        matched = False
        for pattern, standard_name in section_mapping.items():
            if re.search(pattern, line_upper, re.IGNORECASE):
                current_section = standard_name
                layout[current_section] = []
                matched = True
                break
        if not matched and current_section:
            # Filter out obvious gibberish
            if re.search(r'^[A-Za-z]{2,}$', line) or len(line.split()) < 2:
                continue
            layout[current_section].append(line)
    
    # Detect placeholders with regex
    placeholders = {}
    for section, lines in layout.items():
        for line in lines:
            # Match amounts (e.g., $1,234.56 or 1234.56)
            amount_match = re.search(r'\$?-?\d{1,3}(,\d{3})*(\.\d{2})?', line)
            if amount_match:
                placeholders[f"{section}_AMOUNT"] = amount_match.group(0)
            # Match dates (e.g., MM/DD/YYYY or MM/DD/YY)
            date_match = re.search(r'\d{2}/\d{2}/\d{2,4}', line)
            if date_match:
                placeholders[f"{section}_DATE"] = date_match.group(0)
    
    # Load manual template if available, overriding OCR if present
    template_path = os.path.join(SAMPLE_STATEMENTS_DIR, f"{bank}_template.json")
    if os.path.exists(template_path):
        with open(template_path, 'r') as f:
            manual_template = json.load(f)
            layout.update(manual_template.get('layout', {}))
            placeholders.update(manual_template.get('placeholders', {}))
    
    # Log OCR text and results for debugging
    with open(os.path.join(SYNTHETIC_STATEMENTS_DIR, f"ocr_{os.path.basename(image_path)}.txt"), 'w') as f:
        f.write(f"Detected Bank: {bank}\n")
        f.write("Detected Layout:\n")
        for section, lines in layout.items():
            f.write(f"{section}: {lines}\n")
        f.write("Detected Placeholders:\n")
        for key, value in placeholders.items():
            f.write(f"{key}: {value}\n")
    
    return layout, placeholders, bank

In [11]:
import numpy as np

def process_bank_statement_image(image_path: str) -> tuple[dict, dict, str]:
    image = Image.open(image_path).convert('L')
    
    # Enhanced preprocessing
    enhancer = ImageEnhance.Contrast(image)
    image = enhancer.enhance(3.0)  # Increase contrast
    image = image.filter(ImageFilter.MedianFilter())  # Reduce noise
    image = image.filter(ImageFilter.SHARPEN)  # Sharpen edges
    image = image.resize((1200, 1800), Image.Resampling.LANCZOS)  # Higher resolution
    
    # Adaptive thresholding for better text clarity
    img_array = np.array(image)
    threshold = np.mean(img_array)
    image = image.point(lambda p: 255 if p > threshold else 0)
    
    text = pytesseract.image_to_string(image, config='--psm 6 --oem 3')  # PSM 6 for single uniform block, OEM 3 for default
    lines = text.split('\n')
    layout = {}
    current_section = None
    bank = identify_bank(image_path)
    
    # Updated section mapping to match Chase statement structure
    section_mapping = {
        r'summary|account summary|checking summary': 'CHECKING SUMMARY',
        r'deposits|additions': 'DEPOSITS AND ADDITIONS',
        r'withdrawals|subtractions|electronic withdrawals': 'WITHDRAWALS',
        r'daily ending balance|daily balance': 'DAILY ENDING BALANCE',
    }
    
    for line in lines:
        line = line.strip()
        if not line or len(line) < 5 or not any(c.isalpha() for c in line):
            continue
        line_upper = line.upper()
        matched = False
        for pattern, standard_name in section_mapping.items():
            if re.search(pattern, line_upper, re.IGNORECASE):
                current_section = standard_name
                layout[current_section] = []
                matched = True
                break
        if current_section and not matched:
            # Filter out gibberish and align with Chase format
            if re.search(r'^[A-Za-z]{2,}$', line) or len(line.split()) < 2:
                continue
            # Parse lines into key-value pairs or list items
            parts = line.split()
            if any(k in line_upper for k in ['BEGINNING', 'DEPOSITS', 'WITHDRAWALS', 'ENDING', 'DATE', 'AMOUNT']):
                layout[current_section].append(line)
    
    # Detect placeholders with regex, tailored to Chase format
    placeholders = {}
    for section, lines in layout.items():
        for line in lines:
            # Match amounts (e.g., $15,050.80 or -350.04)
            amount_match = re.search(r'\$?-?\d{1,3}(?:,\d{3})*(?:\.\d{2})?', line)
            if amount_match:
                placeholders[f"{section}_AMOUNT"] = amount_match.group(0)
            # Match dates (e.g., 09/15 or 03/01/2016)
            date_match = re.search(r'\d{2}/\d{2}(?:/\d{2,4})?', line)
            if date_match:
                placeholders[f"{section}_DATE"] = date_match.group(0)
            # Match instances (e.g., 2)
            instance_match = re.search(r'\d+', line)
            if instance_match and 'INSTANCES' in line_upper:
                placeholders[f"{section}_INSTANCES"] = instance_match.group(0)
    
    # Load manual template if available
    template_path = os.path.join(SAMPLE_STATEMENTS_DIR, f"{bank}_template.json")
    if os.path.exists(template_path):
        with open(template_path, 'r') as f:
            manual_template = json.load(f)
            layout.update(manual_template.get('layout', {}))
            placeholders.update(manual_template.get('placeholders', {}))
    
    # Log OCR text and results
    with open(os.path.join(SYNTHETIC_STATEMENTS_DIR, f"ocr_{os.path.basename(image_path)}.txt"), 'w') as f:
        f.write(f"Detected Bank: {bank}\n")
        f.write("Detected Layout:\n")
        for section, lines in layout.items():
            f.write(f"{section}: {lines}\n")
        f.write("Detected Placeholders:\n")
        for key, value in placeholders.items():
            f.write(f"{key}: {value}\n")
    
    return layout, placeholders, bank

In [12]:
def generate_populated_pdf(df: pd.DataFrame, account_holder: str, layout: dict, placeholders: dict, bank: str, output_filename: str) -> str:
    doc = SimpleDocTemplate(output_filename, pagesize=letter, rightMargin=0.5*inch, leftMargin=0.5*inch, topMargin=0.75*inch, bottomMargin=0.5*inch)
    elements = []
    styles = getSampleStyleSheet()
    heading_style = styles['Heading2']
    detail_style = styles['Normal']
    detail_style.fontSize = 10
    detail_style.leading = 12

    # Add logo if available
    logo_path = get_logo_path(bank)
    if logo_path:
        logo = normalize_logo(logo_path)
        logo_temp_path = os.path.join(SYNTHETIC_STATEMENTS_DIR, f"chase_example.png")
        logo.save(logo_temp_path)
        logo_image = ReportLabImage(logo_temp_path, width=150, height=50)  # Fixed dimensions to match Chase logo size
        elements.append(logo_image)
        elements.append(Spacer(1, 0.1*inch))

    # Add header matching Chase format
    elements.append(Paragraph(f"JPMorgan Chase Bank, N.A.", heading_style))
    elements.append(Paragraph(f"P.O. Box 659754, San Antonio, TX 78265-9754", detail_style))
    elements.append(Paragraph(f"{account_holder}", detail_style))
    elements.append(Paragraph(f"{min(df['Date']).replace('-', '/')} through {max(df['Date']).replace('-', '/')}", detail_style))
    elements.append(Paragraph(f"Account Number: {fake.bban()[:10]}", detail_style))
    elements.append(Spacer(1, 0.1*inch))

    # Add Checking Summary section
    if 'CHECKING SUMMARY' in layout:
        elements.append(Paragraph("CHECKING SUMMARY", heading_style))
        elements.append(Paragraph("Chase Business Select Checking", detail_style))
        elements.append(Spacer(1, 0.1*inch))
        summary_data = [["", "INSTANCES", "AMOUNT"]]
        initial_balance = 15050.80  # Example from Chase statement
        deposits_total = sum(x for x in df['Amount'] if x > 0)
        withdrawals_total = sum(x for x in df['Amount'] if x < 0)
        ending_balance = initial_balance + deposits_total + withdrawals_total
        summary_data.extend([
            ["Beginning Balance", 1, f"${initial_balance:,.2f}"],
            ["Deposits and Additions", len([x for x in df['Amount'] if x > 0]), f"${deposits_total:,.2f}"],
            ["Electronic Withdrawals", len([x for x in df['Amount'] if x < 0]), f"${abs(withdrawals_total):,.2f}"],
            ["Ending Balance", 1, f"${ending_balance:,.2f}"]
        ])
        summary_table = Table(summary_data, colWidths=[2.5*inch, 1*inch, 1.5*inch])
        summary_table.setStyle(TableStyle([
            ('GRID', (0, 0), (-1, -1), 0.25, colors.black),
            ('ALIGN', (0, 0), (-1, -1), 'LEFT'),
            ('ALIGN', (2, 1), (2, -1), 'RIGHT'),
            ('FONTSIZE', (0, 0), (-1, -1), 10),
            ('BACKGROUND', (0, 0), (-1, 0), colors.lightgrey),
            ('TEXTCOLOR', (0, 0), (-1, 0), colors.black),
        ]))
        elements.append(summary_table)
        elements.append(Paragraph("Your monthly service fee was waived because you maintained an average checking balance of $7,500.00 or a minimum checking balance of $5,000.00 or more during the statement period.", detail_style))
        elements.append(Spacer(1, 0.25*inch))

    # Add Deposits and Additions section
    if 'DEPOSITS AND ADDITIONS' in layout:
        elements.append(Paragraph("DEPOSITS AND ADDITIONS", heading_style))
        elements.append(Spacer(1, 0.1*inch))
        deposits_data = [["DATE", "DESCRIPTION", "AMOUNT"]]
        for _, row in df.iterrows():
            if row['Amount'] > 0:
                deposits_data.append([row['Date'].replace('-', '/'), row['Description'], f"${row['Amount']:,.2f}"])
        deposits_table = Table(deposits_data, colWidths=[1*inch, 2.5*inch, 1*inch])
        deposits_table.setStyle(TableStyle([
            ('GRID', (0, 0), (-1, -1), 0.25, colors.black),
            ('ALIGN', (0, 0), (-1, -1), 'LEFT'),
            ('ALIGN', (2, 1), (2, -1), 'RIGHT'),
            ('FONTSIZE', (0, 0), (-1, -1), 10),
            ('BACKGROUND', (0, 0), (-1, 0), colors.lightgrey),
            ('TEXTCOLOR', (0, 0), (-1, 0), colors.black),
        ]))
        elements.append(deposits_table)
        elements.append(Spacer(1, 0.25*inch))

    # Add Withdrawals section
    if 'WITHDRAWALS' in layout:
        elements.append(Paragraph("WITHDRAWALS", heading_style))
        elements.append(Spacer(1, 0.1*inch))
        withdrawals_data = [["DATE", "DESCRIPTION", "AMOUNT"]]
        for _, row in df.iterrows():
            if row['Amount'] < 0:
                withdrawals_data.append([row['Date'].replace('-', '/'), row['Description'], f"${abs(row['Amount']):,.2f}"])
        withdrawals_table = Table(withdrawals_data, colWidths=[1*inch, 2.5*inch, 1*inch])
        withdrawals_table.setStyle(TableStyle([
            ('GRID', (0, 0), (-1, -1), 0.25, colors.black),
            ('ALIGN', (0, 0), (-1, -1), 'LEFT'),
            ('ALIGN', (2, 1), (2, -1), 'RIGHT'),
            ('FONTSIZE', (0, 0), (-1, -1), 10),
            ('BACKGROUND', (0, 0), (-1, 0), colors.lightgrey),
            ('TEXTCOLOR', (0, 0), (-1, 0), colors.black),
        ]))
        elements.append(withdrawals_table)
        elements.append(Spacer(1, 0.25*inch))

    # Add Daily Ending Balance section
    if 'DAILY ENDING BALANCE' in layout:
        elements.append(Paragraph("DAILY ENDING BALANCE", heading_style))
        elements.append(Spacer(1, 0.1*inch))
        daily_balance_data = [["DATE", "AMOUNT"]]
        for _, row in df.iterrows():
            daily_balance_data.append([row['Date'].replace('-', '/'), f"${row['Balance']:,.2f}"])
        daily_balance_table = Table(daily_balance_data, colWidths=[1*inch, 1.5*inch])
        daily_balance_table.setStyle(TableStyle([
            ('GRID', (0, 0), (-1, -1), 0.25, colors.black),
            ('ALIGN', (0, 0), (-1, -1), 'LEFT'),
            ('ALIGN', (1, 1), (1, -1), 'RIGHT'),
            ('FONTSIZE', (0, 0), (-1, -1), 10),
            ('BACKGROUND', (0, 0), (-1, 0), colors.lightgrey),
            ('TEXTCOLOR', (0, 0), (-1, 0), colors.black),
        ]))
        elements.append(daily_balance_table)
        elements.append(Spacer(1, 0.25*inch))

    # Add Customer Service Information
    elements.append(Paragraph("CUSTOMER SERVICE INFORMATION", heading_style))
    elements.append(Paragraph("Web site: chase.com", detail_style))
    elements.append(Paragraph("Service Center: 1-800-242-7338", detail_style))
    elements.append(Paragraph("Hearing Impaired: 1-800-242-7383", detail_style))
    elements.append(Paragraph("Para Espanol: 1-888-622-4273", detail_style))
    elements.append(Paragraph("International Calls: 1-713-262-1679", detail_style))

    doc.build(elements)
    if logo_path and os.path.exists(logo_temp_path):
        os.remove(logo_temp_path)
    return output_filename

In [14]:
# Cell 12: Main execution with batch processing
if __name__ == "__main__":
    num_transactions = 12
    account_holder = "John Doe"
    valid_extensions = (".jpg", ".jpeg", ".png")
    
    for filename in os.listdir(SAMPLE_STATEMENTS_DIR):
        if filename.lower().endswith(valid_extensions):
            image_path = os.path.join(SAMPLE_STATEMENTS_DIR, filename)
            print(f"\nProcessing image: {filename}")
            
            df = generate_bank_statement(num_transactions, account_holder)
            csv_filename = os.path.join(
                SYNTHETIC_STATEMENTS_DIR,
                f"bank_statement_{account_holder.replace(' ', '_')}_{filename.rsplit('.', 1)[0]}.csv"
            )
            df.to_csv(csv_filename, index=False)
            print(f"CSV saved as: {csv_filename}")
            
            layout, placeholders, bank = process_bank_statement_image(image_path)
            print(f"Detected Bank: {bank}")
            print("Detected Layout:", layout)
            print("Detected Placeholders:", placeholders)
            
            pdf_filename = os.path.join(
                SYNTHETIC_STATEMENTS_DIR,
                f"bank_statement_{account_holder.replace(' ', '_')}_{bank}_{filename.rsplit('.', 1)[0]}.pdf"
            )
            generate_populated_pdf(df, account_holder, layout, placeholders, bank, pdf_filename)
            print(f"PDF saved as: {pdf_filename}")


Processing image: chase_example.png


C:\Users\cha\AppData\Local\Temp\ipykernel_16812\3215599953.py:25: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  return transaction.dict()


CSV saved as: synthetic_statements\bank_statement_John_Doe_chase_example.csv
Detected Bank: generic
Detected Layout: {}
Detected Placeholders: {}
Logo not found for bank: generic
PDF saved as: synthetic_statements\bank_statement_John_Doe_generic_chase_example.pdf

Processing image: chase_statement.png
CSV saved as: synthetic_statements\bank_statement_John_Doe_chase_statement.csv
Detected Bank: generic
Detected Layout: {'DEPOSITS AND ADDITIONS': []}
Detected Placeholders: {}
Logo not found for bank: generic
PDF saved as: synthetic_statements\bank_statement_John_Doe_generic_chase_statement.pdf

Processing image: pnc_statement.png
CSV saved as: synthetic_statements\bank_statement_John_Doe_pnc_statement.csv
Detected Bank: pnc
Detected Layout: {}
Detected Placeholders: {}
PDF saved as: synthetic_statements\bank_statement_John_Doe_pnc_pnc_statement.pdf

Processing image: unidentified_sample.png




CSV saved as: synthetic_statements\bank_statement_John_Doe_unidentified_sample.csv
Detected Bank: generic
Detected Layout: {'CHECKING SUMMARY': [], 'DEPOSITS AND ADDITIONS': [], 'WITHDRAWALS': []}
Detected Placeholders: {}
Logo not found for bank: generic
PDF saved as: synthetic_statements\bank_statement_John_Doe_generic_unidentified_sample.pdf

Processing image: wells_fargo_statement.png
CSV saved as: synthetic_statements\bank_statement_John_Doe_wells_fargo_statement.csv
Detected Bank: wells_fargo
Detected Layout: {}
Detected Placeholders: {}
PDF saved as: synthetic_statements\bank_statement_John_Doe_wells_fargo_wells_fargo_statement.pdf


Below sections is the same as the above but testing with just Chase because there were formatting issues when testing with multiple banks


In [5]:
# Cell 1: Import libraries
import os
import json
import re
from PIL import Image, ImageFilter, ImageEnhance
import pytesseract
import pandas as pd
import numpy as np
from faker import Faker
from datetime import datetime, timedelta
import random
import ollama
from pydantic import BaseModel, Field
from typing import List, Dict
from reportlab.lib import colors
from reportlab.lib.pagesizes import letter
from reportlab.lib.units import inch
from reportlab.platypus import SimpleDocTemplate, Paragraph, Table, TableStyle, Spacer, Image as ReportLabImage
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib.enums import TA_CENTER, TA_RIGHT

# Initialize Faker
fake = Faker()

# Ensure Tesseract is in PATH
pytesseract.pytesseract.tesseract_cmd = r'C:\Users\cha\AppData\Local\Programs\Tesseract-OCR\tesseract.exe'

# Cell 2: Directory setup
SAMPLE_STATEMENT_DIR = "sample_statements"
SAMPLE_LOGOS_DIR = "sample_logos"
SYNTHETIC_STAT_DIR = "synthetic_statements"

# Create directories if they don’t exist
for directory in [SAMPLE_STATEMENT_DIR, SAMPLE_LOGOS_DIR, SYNTHETIC_STAT_DIR]:
    os.makedirs(directory, exist_ok=True)

# Chase-specific logo
BANK_LOGO = "chase_bank_logo.png"
BANK_NAME = "chase"

# Cell 3: Pydantic model for transactions
class Transaction(BaseModel):
    description: str = Field(..., max_length=35, description="Transaction description")
    category: str
    amount: float

# Cell 4: Generate category lists
def generate_category_lists() -> tuple[List[str], List[str]]:
    prompt = """
    Generate two lists of bank transaction categories in JSON format for Chase bank statements.
    One list for reasons someone loses money (e.g., utilities, subscriptions) and one for reasons someone gains money (e.g., deposits, refunds).
    Each list should have 5 unique categories, each 1-2 words, title case, no punctuation.
    Return:
    {
      "loss_categories": ["Category One", "Category Two", ...],
      "gain_categories": ["Category One", "Category Two", ...]
    }
    """
    try:
        response = ollama.generate(model="mistral:7b-instruct-v0.3-q4_0", prompt=prompt)
        category_data = json.loads(response['response'].strip())
        loss_categories = [cat for cat in category_data.get("loss_categories", []) if isinstance(cat, str) and 1 <= len(cat.split()) <= 2]
        gain_categories = [cat for cat in category_data.get("gain_categories", []) if isinstance(cat, str) and 1 <= len(cat.split()) <= 2]
        if len(loss_categories) < 5 or len(gain_categories) < 5:
            raise ValueError("Insufficient valid categories")
    except (json.JSONDecodeError, ValueError):
        loss_categories = ["Utility Payment", "Subscription Fee", "Online Purchase", "Rent Payment", "Grocery Shopping"]
        gain_categories = ["Salary Deposit", "Tax Refund", "Gift Received", "Client Payment", "Cash Deposit"]
    return loss_categories, gain_categories

# Cell 5: Generate transaction description
def generate_transaction_description(amount: float, category: str) -> dict:
    prompt = f"""
    Generate a bank transaction description (3-5 words, max 25 characters) for a Chase bank transaction in the '{category}' category.
    Rules:
    - Use title case.
    - No punctuation.
    - No parentheses, dashes, or dollar signs.
    - No amounts or numbers as words.
    - Use simple phrases.
    - Examples: 'Grocery Store Purchase', 'Utility Bill Payment'
    """
    try:
        response = ollama.generate(model="mistral:7b-instruct-v0.3-q4_0", prompt=prompt)
        description = response['response'].strip()[:25]
    except:
        description = f"{category} Transaction"
    description = description.replace("(", "").replace(")", "").replace(",", "").replace(":", "").replace("-", "").replace("$", "").replace(".", "")
    description = ' '.join(word.capitalize() for word in description.split())[:25]
    words = description.split()
    if len(words) < 3 or len(words) > 5:
        description = f"{category} Transaction"[:25]
    transaction = Transaction(description=description, category=category, amount=amount)
    return transaction.dict()

# Cell 6: Generate synthetic bank statement
from datetime import datetime, timedelta

def generate_bank_statement(num_transactions: int, account_holder: str) -> pd.DataFrame:
    data = []
    start_date = datetime.now() - timedelta(days=30)
    balance = 15050.80
    
    for i in range(num_transactions):
        date = start_date + timedelta(days=i)
        amount = random.uniform(-500, 500)
        balance += amount
        description = f"Transaction {i+1}"
        data.append([date, description, amount, balance])
    
    df = pd.DataFrame(data, columns=['Date', 'Description', 'Amount', 'Balance'])
    return df

# Cell 7: Normalize logo dimensions
def normalize_logo(logo_path: str, max_width: int = 150, max_height: int = 50) -> Image.Image:
    img = Image.open(logo_path).convert("RGB")
    aspect = img.height / img.width
    if img.width > max_width:
        img = img.resize((max_width, int(max_width * aspect)), Image.Resampling.LANCZOS)
    if img.height > max_height:
        img = img.resize((int(max_height / aspect), max_height), Image.Resampling.LANCZOS)
    return img

# Cell 8: Process Chase statement image
def process_bank_statement_image(image_path: str) -> tuple[dict, dict]:
    image = Image.open(image_path).convert('L')
    image = ImageEnhance.Contrast(image).enhance(3.0)
    image = image.filter(ImageFilter.MedianFilter())
    image = image.filter(ImageFilter.SHARPEN)
    image = image.resize((1200, 1800), Image.Resampling.LANCZOS)
    img_array = np.array(image)
    threshold = np.mean(img_array)
    image = image.point(lambda p: 255 if p > threshold else 0)
    
    text = pytesseract.image_to_string(image, config='--psm 6 --oem 3')
    lines = text.split('\n')
    layout: Dict[str, List[str]] = {}
    current_section = None
    
    section_mapping = {
        r'summary|checking summary': 'CHECKING SUMMARY',
        r'deposits|additions': 'DEPOSITS AND ADDITIONS',
        r'withdrawals|electronic withdrawals': 'WITHDRAWALS',
        r'daily ending balance': 'DAILY ENDING BALANCE',
    }
    
    for line in lines:
        line = line.strip()
        if not line or len(line) < 5 or not any(c.isalpha() for c in line):
            continue
        line_upper = line.upper()
        matched = False
        for pattern, standard_name in section_mapping.items():
            if re.search(pattern, line_upper, re.IGNORECASE):
                current_section = standard_name
                layout[current_section] = []
                matched = True
                break
        if current_section and not matched:
            if re.search(r'^[A-Za-z]{2,}$', line) or len(line.split()) < 2:
                continue
            layout[current_section].append(line)
    
    placeholders = {}
    for section, lines in layout.items():
        for line in lines:
            amount_match = re.search(r'\$?-?\d{1,3}(?:,\d{3})*(?:\.\d{2})?', line)
            if amount_match:
                placeholders[f"{section}_AMOUNT"] = amount_match.group(0)
            date_match = re.search(r'\d{2}/\d{2}(?:/\d{2,4})?', line)
            if date_match:
                placeholders[f"{section}_DATE"] = date_match.group(0)
            instance_match = re.search(r'\d+', line)
            if instance_match and 'INSTANCES' in line_upper:
                placeholders[f"{section}_INSTANCES"] = instance_match.group(0)
    
    # Ensure all required sections are present
    required_sections = ['CHECKING SUMMARY', 'DEPOSITS AND ADDITIONS', 'WITHDRAWALS', 'DAILY ENDING BALANCE']
    for section in required_sections:
        if section not in layout:
            layout[section] = []
    
    template_path = os.path.join(SAMPLE_STATEMENT_DIR, "chase_template.json")
    if os.path.exists(template_path):
        with open(template_path, 'r') as f:
            manual_template = json.load(f)
            layout.update(manual_template.get('layout', {}))
            placeholders.update(manual_template.get('placeholders', {}))
    
    log_path = os.path.join(SYNTHETIC_STAT_DIR, f"ocr_chase_example.txt")
    with open(log_path, 'w') as f:
        f.write(f"Detected Bank: {BANK_NAME}\n")
        f.write("Detected Layout:\n")
        for section, lines in layout.items():
            f.write(f"{section}: {lines}\n")
        f.write("Detected Placeholders:\n")
        for key, value in placeholders.items():
            f.write(f"{key}: {value}\n")
    
    return layout, placeholders




In [62]:


# Cell 9: Generate populated PDF with Chase-like styling and corrected layout
def generate_populated_pdf(df: pd.DataFrame, account_holder: str, layout: dict, placeholders: dict, output_filename: str) -> str:
    doc = SimpleDocTemplate(output_filename, pagesize=letter, rightMargin=0.25*inch, leftMargin=0.25*inch, topMargin=0.5*inch, bottomMargin=0.25*inch)
    elements = []
    styles = getSampleStyleSheet()
    
    # Convert Date column to datetime if it’s not already
    df['Date'] = pd.to_datetime(df['Date'])
    
    # Custom styles
    title_style = ParagraphStyle(
        name='TitleStyle',
        parent=styles['Heading1'],
        fontSize=8,
        textColor=colors.black,
        alignment=TA_CENTER,
        spaceAfter=2
    )
    heading_style = ParagraphStyle(
        name='HeadingStyle',
        parent=styles['Heading2'],
        fontSize=8,
        textColor=colors.black,
        spaceAfter=2
    )
    detail_style = ParagraphStyle(
        name='DetailStyle',
        parent=styles['Normal'],
        fontSize=8,
        leading=10,
        spaceAfter=0
    )
    note_style = ParagraphStyle(
        name='NoteStyle',
        parent=styles['Normal'],
        fontSize=7,
        leading=9,
        spaceAfter=4
    )
    amount_style = ParagraphStyle(
        name='AmountStyle',
        parent=styles['Normal'],
        fontSize=10,
        alignment=TA_RIGHT,
        leading=12
    )

    # Initialize logo_temp_path
    logo_temp_path = None

    # Create a two-column table for header (logo centered above text, customer service on right)
    header_table_data = [
        [
            # Left column (logo and bank info)
            [
                Paragraph("", detail_style),  # Placeholder for logo centering
                ReportLabImage(os.path.join(SAMPLE_LOGOS_DIR, BANK_LOGO), width=100, height=40) if os.path.exists(os.path.join(SAMPLE_LOGOS_DIR, BANK_LOGO)) else "",
                Paragraph("JPMorgan Chase Bank, N.A.", detail_style),
                Paragraph("P.O. Box 659754, San Antonio, TX 78265-9754", detail_style),
                Paragraph(account_holder, detail_style),
                Paragraph(f"{min(df['Date']).strftime('%m/%d/%Y')} through {max(df['Date']).strftime('%m/%d/%Y')}", detail_style),
                Paragraph(f"Account Number: {fake.bban()[:10]}", detail_style)
            ],
            # Right column (customer service info)
            [
                Paragraph("_________________________________", heading_style),
                Paragraph("CUSTOMER SERVICE INFORMATION", heading_style),
                Paragraph("_________________________________", heading_style),
                Paragraph("Web site: chase.com", detail_style),
                Paragraph("Service Center: 1-800-242-7338", detail_style),
                Paragraph("Hearing Impaired: 1-800-242-7383", detail_style),
                Paragraph("Para Espanol: 1-888-622-4273", detail_style),
                Paragraph("International Calls: 1-713-262-1679", detail_style)
            ]
        ]
    ]
    header_table = Table(header_table_data, colWidths=[5*inch, 2.8*inch])
    header_table.setStyle(TableStyle([
        ('VALIGN', (0, 0), (-1, -1), 'TOP'),
        ('LEFTPADDING', (0, 0), (0, -1), 0),
        ('RIGHTPADDING', (1, 0), (1, -1), 0),
        ('TOPPADDING', (0, 0), (-1, -1), 0),
        ('BOTTOMPADDING', (0, 0), (-1, -1), 0),
        ('ALIGN', (1, 0), (1, -1), 'RIGHT'),
    ]))
    elements.append(header_table)
    elements.append(Spacer(1, 0.1*inch))



#     # Title
#     title_table_data = [
#     [Paragraph("CHECKING SUMMARY", heading_style), Paragraph("Chase Business Select Checking", title_style) ]
# ]
#     title_table = Table(title_table_data, colWidths=[2.5*inch, 2.5*inch])
#     title_table.setStyle(TableStyle([('VALIGN', (0, 0), (-1, -1), 'MIDDLE')]))
#     elements.append(title_table)
#     elements.append(Spacer(1, 0.05*inch))
    

    # Checking Summary
    title_table_data = [
        [Paragraph("CHECKING SUMMARY", heading_style), Paragraph("Chase Business Select Checking", title_style)]
    ]
    title_table = Table(title_table_data, colWidths=[2*inch, 3*inch])
    title_table.setStyle(TableStyle([('VALIGN', (0, 0), (-1, -1), 'MIDDLE')]))
    elements.append(title_table)

    initial_balance = 15050.80
    deposits_total = sum(x for x in df['Amount'] if x > 0)
    withdrawals_total = abs(sum(x for x in df['Amount'] if x < 0))
    ending_balance = initial_balance + deposits_total - withdrawals_total
    summary_data = [["", "INSTANCES", "AMOUNT"]]
    summary_data.extend([
        ["Beginning Balance", 1, f"${initial_balance:,.2f}"],
        ["Deposits and Additions", len([x for x in df['Amount'] if x > 0]), f"${deposits_total:,.2f}"],
        ["Electronic Withdrawals", len([x for x in df['Amount'] if x < 0]), f"${withdrawals_total:,.2f}"],
        ["Ending Balance", 1, f"${ending_balance:,.2f}"]
    ])
    summary_table = Table(summary_data, colWidths=[2.5*inch, 1*inch, 1.5*inch])
    summary_table.setStyle(TableStyle([
        ('GRID', (0, 0), (-1, -1), 0, colors.black),  # Set grid line width to 0 for transparency
        ('BACKGROUND', (0, 0), (-1, 0), colors.white),  # Remove background color (use white or omit)
        ('TEXTCOLOR', (0, 0), (-1, 0), colors.black),
        ('ALIGN', (0, 0), (-1, -1), 'LEFT'),
        ('ALIGN', (2, 1), (2, -1), 'RIGHT'),
        ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
        ('FONTSIZE', (0, 0), (-1, -1), 10),
        ('VALIGN', (0, 0), (-1, -1), 'MIDDLE'),
    ]))
    elements.append(summary_table)
    elements.append(Paragraph("Your monthly service fee was waived because you maintained an average checking balance of $7,500.00 or a minimum checking balance of $5,000.00 or more during the statement period.", note_style))
    elements.append(Spacer(1, 0.1*inch))

    # Deposits and Additions
    if any(x > 0 for x in df['Amount']):
        elements.append(Paragraph("DEPOSITS AND ADDITIONS", heading_style))
        deposits_data = [["DATE", "DESCRIPTION", "AMOUNT"]]
        for _, row in df.iterrows():
            if row['Amount'] > 0:
                deposits_data.append([row['Date'].strftime('%m/%d/%Y'), row['Description'], f"${row['Amount']:,.2f}"])
        deposits_table = Table(deposits_data, colWidths=[0.8*inch, 2.7*inch, 1*inch])
        deposits_table.setStyle(TableStyle([
            ('GRID', (0, 0), (-1, -1), 0, colors.black),  # Transparent grid lines
            ('BACKGROUND', (0, 0), (-1, 0), colors.white),  # No background
            ('TEXTCOLOR', (0, 0), (-1, 0), colors.black),
            ('ALIGN', (0, 0), (-1, -1), 'LEFT'),
            ('ALIGN', (2, 1), (2, -1), 'RIGHT'),
            ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
            ('FONTSIZE', (0, 0), (-1, -1), 10),
            ('VALIGN', (0, 0), (-1, -1), 'MIDDLE'),
        ]))
        elements.append(deposits_table)
        elements.append(Spacer(1, 0.1*inch))

    # Withdrawals
    if any(x < 0 for x in df['Amount']):
        elements.append(Paragraph("WITHDRAWALS", heading_style))
        withdrawals_data = [["DATE", "DESCRIPTION", "AMOUNT"]]
        for _, row in df.iterrows():
            if row['Amount'] < 0:
                withdrawals_data.append([row['Date'].strftime('%m/%d/%Y'), row['Description'], f"${abs(row['Amount']):,.2f}"])
        withdrawals_table = Table(withdrawals_data, colWidths=[0.8*inch, 2.7*inch, 1*inch])
        withdrawals_table.setStyle(TableStyle([
            ('GRID', (0, 0), (-1, -1), 0, colors.black),  # Transparent grid lines
            ('BACKGROUND', (0, 0), (-1, 0), colors.white),  # No background
            ('TEXTCOLOR', (0, 0), (-1, 0), colors.black),
            ('ALIGN', (0, 0), (-1, -1), 'LEFT'),
            ('ALIGN', (2, 1), (2, -1), 'RIGHT'),
            ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
            ('FONTSIZE', (0, 0), (-1, -1), 10),
            ('VALIGN', (0, 0), (-1, -1), 'MIDDLE'),
        ]))
        elements.append(withdrawals_table)
        elements.append(Spacer(1, 0.1*inch))

    # Daily Ending Balance
    elements.append(Paragraph("DAILY ENDING BALANCE", heading_style))
    daily_balance_data = [["DATE", "AMOUNT"]]
    balance_dict = {}
    for _, row in df.iterrows():
        date = row['Date'].strftime('%m/%d/%Y')
        balance_dict[date] = row['Balance']
    for date in sorted(balance_dict.keys()):
        daily_balance_data.append([date, f"${balance_dict[date]:,.2f}"])
    daily_balance_table = Table(daily_balance_data, colWidths=[0.8*inch, 1.5*inch])
    daily_balance_table.setStyle(TableStyle([
        ('GRID', (0, 0), (-1, -1), 0, colors.black),  # Transparent grid lines
        ('BACKGROUND', (0, 0), (-1, 0), colors.white),  # No background
        ('TEXTCOLOR', (0, 0), (-1, 0), colors.black),
        ('ALIGN', (0, 0), (-1, -1), 'LEFT'),
        ('ALIGN', (1, 1), (1, -1), 'RIGHT'),
        ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
        ('FONTSIZE', (0, 0), (-1, -1), 10),
        ('VALIGN', (0, 0), (-1, -1), 'MIDDLE'),
    ]))
    elements.append(daily_balance_table)
    elements.append(Spacer(1, 0.1*inch))

    # Build PDF
    try:
        doc.build(elements)
    except Exception as e:
        print(f"Error building PDF: {e}")
        raise
    finally:
        if logo_temp_path is not None and os.path.exists(logo_temp_path):
            os.remove(logo_temp_path)
    
    return output_filename

In [63]:


# Cell 10: Main execution
if __name__ == "__main__":
    num_transactions = 2
    account_holder = "John Doe"
    valid_extensions = (".jpg", ".jpeg", ".png")
    df = generate_bank_statement(num_transactions, account_holder)
    
    filename = None
    for ext in valid_extensions:
        if os.path.exists(os.path.join(SAMPLE_STATEMENT_DIR, f"chase_example{ext}")):
            filename = f"chase_example{ext}"
            break
    
    if filename:
        image_path = os.path.join(SAMPLE_STATEMENT_DIR, filename)
        print(f"\nProcessing image: {filename}")
        
        df = generate_bank_statement(num_transactions, account_holder)
        csv_filename = os.path.join(SYNTHETIC_STAT_DIR, f"bank_statement_{account_holder.replace(' ', '_')}_chase.csv")
        df.to_csv(csv_filename, index=False)
        print(f"CSV saved as: {csv_filename}")
        
        layout, placeholders = process_bank_statement_image(image_path)
        print(f"Detected Bank: {BANK_NAME}")
        print("Detected Layout:", layout)
        print("Detected Placeholders:", placeholders)
        
        pdf_filename = os.path.join(SYNTHETIC_STAT_DIR, f"bank_statement_{account_holder.replace(' ', '_')}_chase.pdf")
        generate_populated_pdf(df, account_holder, layout, placeholders, pdf_filename)
        print(f"PDF saved as: {pdf_filename}")
    else:
        print("Chase example file not found in sample_statements directory.")


Processing image: chase_example.png
CSV saved as: synthetic_statements\bank_statement_John_Doe_chase.csv
Detected Bank: chase
Detected Layout: {'CHECKING SUMMARY': [], 'DEPOSITS AND ADDITIONS': [], 'WITHDRAWALS': [], 'DAILY ENDING BALANCE': []}
Detected Placeholders: {}
PDF saved as: synthetic_statements\bank_statement_John_Doe_chase.pdf
