In [1]:
# Cell 1: Import libraries
import os
import json
import re
import base64  # New import for base64 encoding
from faker import Faker
from datetime import datetime, timedelta
import random
import pandas as pd
import ollama
from pydantic import BaseModel, Field
from typing import List, Dict
from jinja2 import Environment, FileSystemLoader
import pdfkit


# Initialize Faker
fake = Faker()


In [2]:
# Cell 2: Directory setup and bank configuration
SAMPLE_LOGOS_DIR = "sample_logos"
SYNTHETIC_STAT_DIR = "new_statements"
TEMPLATES_DIR = "templates"

# Create directories if they don’t exist
for directory in [SAMPLE_LOGOS_DIR, SYNTHETIC_STAT_DIR, TEMPLATES_DIR]:
    os.makedirs(directory, exist_ok=True)

# Bank configuration: map bank names to logos, templates, and specific account names
BANK_CONFIG = {
    "chase": {
        "logo": "chase_bank_logo.png",
        "templates": ["chase_classic_style.html", "chase_variation_1.html", "chase_variation_2.html"],
        "account_types": {
            "personal": "Chase Total Checking",
            "business": "Chase Business Complete Checking"
        }
    },
    "citibank": {
        "logo": "citibank_logo.png",
        "templates": ["citibank_classic_template.html", "citibank_variation_1.html", "citibank_variation_2.html"], 
        "account_types": {
            "personal": "Citi Access Checking",
            "business": "CitiBusiness Checking"
        }
    },
    "wellsfargo": {
        "logo": "wellsfargo_logo.png",
        "templates": ["wells_fargo_classic.html", "wells_variation_1.html", "wells_variation_2.html"],  
        "account_types": {
            "personal": "Everyday Checking",
            "business": "Initiate Business Checking"
        }
    },
    "pnc": {
        "logo": "pnc_logo.png",
        "templates": ["pnc_classic.html"],
        "account_types": {
            "personal": "Standard Checking",
            "business": "Business Checking"
        }
    }
}

# Validate bank configuration
for bank, config in BANK_CONFIG.items():
    logo_path = os.path.join(SAMPLE_LOGOS_DIR, config["logo"])
    for template in config["templates"]:
        template_path = os.path.join(TEMPLATES_DIR, template)
        if not os.path.exists(template_path):
            raise FileNotFoundError(f"Template file not found for {bank}: {template_path}")
    if not os.path.exists(logo_path):
        raise FileNotFoundError(f"Logo file not found for {bank}: {logo_path}")

In [3]:
# Cell 3: Pydantic models
class FieldDefinition(BaseModel):
    name: str = Field(..., description="Field name (e.g., account_holder, account_number)")
    is_mutable: bool = Field(..., description="Whether the field is mutable")
    description: str = Field(..., description="Description of the field")

class StatementFields(BaseModel):
    fields: List[FieldDefinition] = Field(..., description="List of mutable and immutable fields")

class Transaction(BaseModel):
    description: str = Field(..., max_length=35, description="Transaction description")
    category: str
    amount: float
    account_type: str = Field(..., description="Type of account (business or personal)")

In [4]:
# Cell 4: Generate category lists
def generate_category_lists(account_type: str) -> tuple[List[str], List[str]]:
    account_context = "business" if account_type == "business" else "personal"
    prompt = f"""
    Generate two lists of bank transaction categories in JSON format for {account_context} bank statements.
    One list for reasons someone loses money (e.g., {'vendor payments, payroll' if account_type == 'business' else 'utilities, subscriptions'}) and one for reasons someone gains money (e.g., {'client invoices, refunds' if account_type == 'business' else 'deposits, refunds'}).
    Each list should have 5 unique categories, each 1-2 words, title case, no punctuation.
    Return:
    {{
      "loss_categories": ["Category One", "Category Two", ...],
      "gain_categories": ["Category One", "Category Two", ...]
    }}
    """
    try:
        response = ollama.generate(model="mistral:7b-instruct-v0.3-q4_0", prompt=prompt)
        category_data = json.loads(response['response'].strip())
        loss_categories = [cat for cat in category_data.get("loss_categories", []) if isinstance(cat, str) and 1 <= len(cat.split()) <= 2]
        gain_categories = [cat for cat in category_data.get("gain_categories", []) if isinstance(cat, str) and 1 <= len(cat.split()) <= 2]
        if len(loss_categories) < 5 or len(gain_categories) < 5:
            raise ValueError("Insufficient valid categories")
    except (json.JSONDecodeError, ValueError):
        if account_type == "business":
            loss_categories = ["Vendor Payment", "Payroll Expense", "Office Supplies", "Equipment Purchase", "Marketing Cost"]
            gain_categories = ["Client Invoice", "Refund Received", "Investment Income", "Grant Received", "Sales Revenue"]
        else:  # personal
            loss_categories = ["Utility Payment", "Subscription Fee", "Online Purchase", "Rent Payment", "Grocery Shopping"]
            gain_categories = ["Salary Deposit", "Tax Refund", "Gift Received", "Client Payment", "Cash Deposit"]
    return loss_categories, gain_categories

In [5]:
# Cell 5: Generate transaction description
def generate_transaction_description(amount: float, category: str, account_type: str) -> dict:
    account_context = "business" if account_type == "business" else "personal"
    prompt = f"""
    Generate a bank transaction description (3-5 words, max 45 characters) for a {account_context} bank transaction in the '{category}' category.
    Rules:
    - Use title case.
    - No punctuation.
    - No parentheses, dashes, or dollar signs.
    - No amounts or numbers as words.
    - Use simple phrases relevant to {account_context} accounts.
    - Examples: {'Office Supply Purchase' if account_type == 'business' else 'Grocery Store Purchase'}, {'Vendor Invoice Payment' if account_type == 'business' else 'Utility Bill Payment'}
    """
    try:
        response = ollama.generate(model="mistral:7b-instruct-v0.3-q4_0", prompt=prompt)
        description = response['response'].strip()[:25]
    except:
        description = f"{category} Transaction"
    description = description.replace("(", "").replace(")", "").replace(",", "").replace(":", "").replace("-", "").replace("$", "").replace(".", "")
    description = ' '.join(word.capitalize() for word in description.split())[:25]
    words = description.split()
    if len(words) < 3 or len(words) > 5:
        description = f"{category} Transaction"[:45]
    transaction = Transaction(description=description, category=category, amount=amount, account_type=account_type)
    return transaction.model_dump()

In [6]:
# Cell 6: Generate synthetic bank statement
def generate_bank_statement(num_transactions: int, account_holder: str, account_type: str) -> pd.DataFrame:
    if account_type not in ["business", "personal"]:
        raise ValueError("Account type must be 'business' or 'personal'")
    loss_categories, gain_categories = generate_category_lists(account_type)
    start_date = datetime.now() - timedelta(days=30)
    dates = [start_date + timedelta(days=random.randint(0, 30)) for _ in range(num_transactions)]
    transactions = []
    for _ in range(num_transactions):
        is_gain = random.choice([True, False])
        category = random.choice(gain_categories if is_gain else loss_categories)
        amount = round(random.uniform(50, 1000), 2) if is_gain else round(random.uniform(-500, -10), 2)
        transaction = generate_transaction_description(amount, category, account_type)
        transactions.append(transaction)
    data = {
        "Date": [d.strftime("%m/%d") for d in dates],
        "Description": [t["description"] for t in transactions],
        "Category": [t["category"] for t in transactions],
        "Amount": [t["amount"] for t in transactions],
        "Balance": [0.0] * num_transactions,
        "Account Holder": [account_holder] * num_transactions,
        "Account Type": [account_type.capitalize()] * num_transactions,
        "Transaction ID": [(fake.bban()[:10] + str(i).zfill(4)) for i in range(num_transactions)]
    }
    df = pd.DataFrame(data)
    df = df.sort_values("Date")
    initial_balance = round(random.uniform(1000, 20000), 2)
    df["Balance"] = initial_balance + df["Amount"].cumsum()
    return df

In [7]:
# Cell 7: Identify mutable and immutable fields using Ollama
def identify_template_fields(bank: str, templates_dir: str = TEMPLATES_DIR) -> StatementFields:
    if bank not in BANK_CONFIG:
        raise ValueError(f"Unsupported bank: {bank}. Supported banks: {list(BANK_CONFIG.keys())}")
    
    template_path = os.path.join(templates_dir, BANK_CONFIG[bank]["templates"][0])
    if not os.path.exists(template_path):
        raise FileNotFoundError(f"Template file not found: {template_path}")
    
    with open(template_path, 'r', encoding='utf-8') as f:
        template_content = f.read()
    
    placeholders = re.findall(r'\{\{([^{}]+)\}\}', template_content)
    placeholders = [p.strip() for p in placeholders]
    
    prompt = f"""
    Given the following list of placeholders extracted from a {bank.capitalize()} bank statement HTML template, classify each as mutable (can be changed with synthetic data) or immutable (static, e.g., bank details or table headers). Return a JSON object with a list of fields, each containing the field name, whether it is mutable (true/false), and a brief description of what the field represents. Example:
    {{
        "fields": [
            {{"name": "account_holder", "is_mutable": true, "description": "Name of the account holder"}},
            {{"name": "bank_name", "is_mutable": false, "description": "Name of the bank"}}
        ]
    }}
    Placeholders:
    {', '.join(placeholders)}
    Rules:
    - Mutable fields include account holder, client number, date of birth, account number, IBAN, statement period, statement date, transactions, opening balance, total debit, total credit, total balance, and other customer-specific data.
    - Immutable fields include bank name, bank address, table headers, customer service information, and footnotes.
    - Ensure descriptions are relevant to {bank.capitalize()} bank statements.
    """
    try:
        response = ollama.generate(model="mistral:7b-instruct-v0.3-q4_0", prompt=prompt)
        fields_data = json.loads(response['response'].strip())
        statement_fields = StatementFields(**fields_data)
    except (json.JSONDecodeError, ValueError, ollama.RequestError) as e:
        print(f"Ollama failed for {bank}: {e}. Using fallback fields.")
        default_fields = [
            FieldDefinition(name="account_holder", is_mutable=True, description="Name of the account holder"),
            FieldDefinition(name="client_number", is_mutable=True, description="Client identification number"),
            FieldDefinition(name="date_of_birth", is_mutable=True, description="Customer's date of birth"),
            FieldDefinition(name="customer_account_number", is_mutable=True, description="Customer's account number"),
            FieldDefinition(name="customer_iban", is_mutable=True, description="Customer's IBAN"),
            FieldDefinition(name="customer_bank_name", is_mutable=True, description="Name of the customer's bank"),
            FieldDefinition(name="statement_period", is_mutable=True, description="Statement date range"),
            FieldDefinition(name="statement_date", is_mutable=True, description="Date the statement was created"),
            FieldDefinition(name="opening_balance_debit", is_mutable=True, description="Opening balance debit amount"),
            FieldDefinition(name="opening_balance_credit", is_mutable=True, description="Opening balance credit amount"),
            FieldDefinition(name="opening_balance", is_mutable=True, description="Opening balance total"),
            FieldDefinition(name="transactions", is_mutable=True, description="List of transaction details"),
            FieldDefinition(name="total_debit", is_mutable=True, description="Total debit amount"),
            FieldDefinition(name="total_credit", is_mutable=True, description="Total credit amount"),
            FieldDefinition(name="total", is_mutable=True, description="Total balance"),
            FieldDefinition(name="bank_name", is_mutable=False, description=f"Name of the bank ({bank.capitalize()})"),
            FieldDefinition(name="bank_address", is_mutable=False, description="Bank address"),
            FieldDefinition(name="table_headers", is_mutable=False, description="Headers for transaction table"),
            FieldDefinition(name="customer_service", is_mutable=False, description="Customer service contact information"),
            FieldDefinition(name="footnotes", is_mutable=False, description="Footnotes and disclosures")
        ] if bank == "citibank" else [
            FieldDefinition(name="account_holder", is_mutable=True, description="Name of the account holder"),
            FieldDefinition(name="account_holder_address", is_mutable=True, description="Address of the account holder"),
            FieldDefinition(name="account_number", is_mutable=True, description="Account number"),
            FieldDefinition(name="statement_period", is_mutable=True, description="Statement date range"),
            FieldDefinition(name="summary", is_mutable=True, description="Checking summary data (beginning balance, counts, totals)"),
            FieldDefinition(name="deposits", is_mutable=True, description="List of deposit transactions"),
            FieldDefinition(name="withdrawals", is_mutable=True, description="List of withdrawal transactions"),
            FieldDefinition(name="daily_balances", is_mutable=True, description="Daily ending balances"),
            FieldDefinition(name="logo_path", is_mutable=True, description="Path to the bank logo"),
            FieldDefinition(name="bank_name", is_mutable=False, description=f"Name of the bank ({bank.capitalize()})"),
            FieldDefinition(name="bank_address", is_mutable=False, description="Bank address"),
            FieldDefinition(name="checking_summary_header", is_mutable=False, description="Header for checking summary"),
            FieldDefinition(name="deposits_header", is_mutable=False, description="Header for deposits section"),
            FieldDefinition(name="withdrawals_header", is_mutable=False, description="Header for withdrawals section"),
            FieldDefinition(name="daily_balance_header", is_mutable=False, description="Header for daily balance section"),
            FieldDefinition(name="customer_service", is_mutable=False, description="Customer service contact information"),
            FieldDefinition(name="footnotes", is_mutable=False, description="Footnotes and disclosures")
        ]
        statement_fields = StatementFields(fields=default_fields)
    
    log_path = os.path.join(SYNTHETIC_STAT_DIR, f"template_fields_{bank}.json")
    with open(log_path, 'w', encoding='utf-8') as f:
        json.dump(statement_fields.model_dump(), f, indent=2)
    
    return statement_fields

In [8]:
# Cell 8: Generate populated HTML, PDF, and CSV
def generate_populated_html_and_pdf(df: pd.DataFrame, account_holder: str, bank: str, template_dir: str, output_dir: str, account_type: str) -> list:
    import os
    from jinja2 import Environment, FileSystemLoader
    import pdfkit
    import pandas as pd

    if bank not in BANK_CONFIG:
        raise ValueError(f"Unsupported bank: {bank}. Supported banks: {list(BANK_CONFIG.keys())}")

    # Create subdirectories for HTML, PDF, and CSV
    html_dir = os.path.join(output_dir, "HTML")
    pdf_dir = os.path.join(output_dir, "PDF")
    csv_dir = os.path.join(output_dir, "CSV")
    os.makedirs(html_dir, exist_ok=True)
    os.makedirs(pdf_dir, exist_ok=True)
    os.makedirs(csv_dir, exist_ok=True)

    # Get specific account name from BANK_CONFIG
    specific_account_name = BANK_CONFIG[bank]["account_types"][account_type]

    env = Environment(loader=FileSystemLoader(template_dir))

    initial_balance = round(random.uniform(1000, 20000), 2)
    deposits_total = sum(x for x in df['Amount'] if x > 0)
    withdrawals_total = abs(sum(x for x in df['Amount'] if x < 0))
    ending_balance = initial_balance + deposits_total - withdrawals_total
    service_fee = 25 if ending_balance < 5000 else 0
    if service_fee:
        withdrawals_total += service_fee
        ending_balance -= service_fee

    transactions_count = len(df)

    min_date = datetime.strptime(min(df['Date']), "%m/%d").replace(year=2025).strftime("%B %d")
    max_date = datetime.strptime(max(df['Date']), "%m/%d").replace(year=2025).strftime("%B %d")
    statement_date = datetime.now().strftime("%B %d, %Y at %I:%M %p %Z")

    address = fake.address().replace('\n', '<br>')[:100]
    account_holder = account_holder[:50]
    account_number = fake.bban()[:15]

    logo_path = os.path.join(SAMPLE_LOGOS_DIR, BANK_CONFIG[bank]["logo"])
    logo_data = ""
    if os.path.exists(logo_path):
        with open(logo_path, "rb") as img_file:
            logo_data = f"data:image/png;base64,{base64.b64encode(img_file.read()).decode('utf-8')}"

    if bank == "citibank":
        transactions = []
        total_debit = abs(sum(x for x in df['Amount'] if x < 0))
        total_credit = sum(x for x in df['Amount'] if x > 0)
        running_balance = initial_balance
        for _, row in df.iterrows():
            amount = row['Amount']
            debit = f"£{abs(amount):,.2f}" if amount < 0 else ""
            credit = f"£{amount:,.2f}" if amount > 0 else ""
            running_balance += amount
            transactions.append({
                "date": row["Date"],
                "description": row["Description"],
                "debit": debit,
                "credit": credit,
                "balance": f"£{running_balance:,.2f}"
            })

        template_data = {
            "account_holder": account_holder,
            "client_number": fake.uuid4()[:8],
            "date_of_birth": fake.date_of_birth(minimum_age=18, maximum_age=80).strftime("%m/%d/%Y"),
            "customer_account_number": account_number,
            "customer_iban": f"GB{fake.random_number(digits=2)}CITI{fake.random_number(digits=14)}",
            "customer_bank_name": "Citibank",
            "statement_period": f"{min_date} through {max_date}",
            "statement_date": statement_date,
            "opening_balance_debit": "",
            "opening_balance_credit": "",
            "opening_balance": f"£{initial_balance:,.2f}",
            "transactions": transactions,
            "total_debit": f"£{total_debit:,.2f}",
            "total_credit": f"£{total_credit:,.2f}",
            "total": f"£{ending_balance:,.2f}",
            "logo_path": logo_data,
            "account_type": specific_account_name,
            "show_fee_waiver": ending_balance >= 5000
        }
    elif bank == "wellsfargo":
        transactions = []
        running_balance = initial_balance
        for _, row in df.iterrows():
            amount = row['Amount']
            credit = f"${amount:,.2f}" if amount > 0 else ""
            debit = f"${abs(amount):,.2f}" if amount < 0 else ""
            running_balance += amount
            transactions.append({
                "date": row["Date"],
                "description": row["Description"],
                "credit": credit,
                "debit": debit,
                "balance": f"${running_balance:,.2f}",
                "type": "deposit" if amount > 0 else "withdrawal"
            })

        daily_balances = [
            {"date": row["Date"], "amount": f"${row['Balance']:,.2f}"}
            for _, row in df.iterrows()
        ]

        template_data = {
            "account_holder": account_holder,
            "account_holder_address": address,
            "account_number": account_number,
            "statement_period": f"{min_date}, 2025 – {max_date}, 2025",
            "statement_date": statement_date,
            "logo_path": logo_data,
            "summary": {
                "beginning_balance": f"${initial_balance:,.2f}",
                "deposits_total": f"${deposits_total:,.2f}",
                "withdrawals_total": f"${withdrawals_total:,.2f}",
                "ending_balance": f"${ending_balance:,.2f}",
                "deposits_count": sum(1 for x in df['Amount'] if x > 0),
                "withdrawals_count": sum(1 for x in df['Amount'] if x < 0) + (1 if service_fee else 0),
                "transactions_count": transactions_count,
                # New synthetic data for PNC-specific fields
                "average_balance": f"${round((initial_balance + ending_balance) / 2, 2):,.2f}",  # Average of beginning and ending balance
                "fees": f"${service_fee:,.2f}",  # Use service_fee as fees
                "checks_written": random.randint(0, 5),  # Random 0-5 checks
                "pos_transactions": random.randint(0, 10),  # Random 0-10 POS transactions
                "pos_pin_transactions": random.randint(0, 5),  # Random 0-5 PIN transactions
                "total_atm_transactions": random.randint(0, 8),  # Random 0-8 ATM transactions
                "pnc_atm_transactions": random.randint(0, 5),  # Random 0-5 PNC ATM transactions
                "other_atm_transactions": random.randint(0, 3),  # Random 0-3 other ATM transactions
                "apy_earned": f"{random.uniform(0.01, 0.5):.2f}%",  # Random 0.01%-0.5% APY
                "days_in_period": random.randint(28, 31),  # Random 28-31 days
                "average_collected_balance": f"${round(random.uniform(initial_balance, ending_balance), 2):,.2f}",  # Random between initial and ending balance
                "interest_paid_period": f"${random.uniform(0.1, 10):,.2f}",  # Random $0.10-$10 interest
                "interest_paid_ytd": f"${random.uniform(1, 50):,.2f}",  # Random $1-$50 YTD interest
                "overdraft_protection1": "PNC Savings Account XXXX1234" if random.choice([True, False]) else "",
                "overdraft_protection2": "PNC Credit Line XXXX5678" if random.choice([True, False]) else "",
                "overdraft_status": "Opted-In" if random.choice([True, False]) else "Opted-Out"
            },
            "transactions": transactions,
            "daily_balances": daily_balances,
            "account_type": specific_account_name,
            "show_fee_waiver": ending_balance >= 5000
        }
    elif bank == "pnc":
        transactions = []
        running_balance = initial_balance
        for _, row in df.iterrows():
            amount = row['Amount']
            transactions.append({
                "date": row["Date"],
                "description": row["Description"],
                "amount": f"${abs(amount):,.2f}" if amount < 0 else f"${amount:,.2f}",
                "type": "deposit" if amount > 0 else "withdrawal"
            })
            running_balance += amount

        daily_balances = [
            {"date1": df.iloc[i]["Date"], "bal1": f"${df.iloc[i]['Balance']:.2f}", "date2": "", "bal2": "", "date3": "", "bal3": ""}
            for i in range(0, len(df), 3)
        ] + [
            {"date1": df.iloc[i]["Date"], "bal1": f"${df.iloc[i]['Balance']:.2f}", "date2": df.iloc[i+1]["Date"] if i+1 < len(df) else "", "bal2": f"${df.iloc[i+1]['Balance']:.2f}" if i+1 < len(df) else "", "date3": "", "bal3": ""}
            for i in range(len(df)//3 * 3, len(df)-1, 2)
            if len(df) > len(df)//3 * 3
        ]

        template_data = {
            "account_holder": account_holder,
            "account_holder_address": address,
            "account_number": account_number,
            "statement_period": f"{min_date}, 2025 – {max_date}, 2025",
            "statement_date": statement_date,
            "logo_path": logo_data,
            "summary": {
                "beginning_balance": f"${initial_balance:,.2f}",
                "deposits_total": f"${deposits_total:,.2f}",
                "withdrawals_total": f"${withdrawals_total:,.2f}",
                "ending_balance": f"${ending_balance:,.2f}",
                "deposits_count": sum(1 for x in df['Amount'] if x > 0),
                "withdrawals_count": sum(1 for x in df['Amount'] if x < 0) + (1 if service_fee else 0),
                "transactions_count": transactions_count,
                # New synthetic data for PNC-specific fields
                "average_balance": f"${round((initial_balance + ending_balance) / 2, 2):,.2f}",  # Average of beginning and ending balance
                "fees": f"${service_fee:,.2f}",  # Use service_fee as fees
                "checks_written": random.randint(0, 5),  # Random 0-5 checks
                "pos_transactions": random.randint(0, 10),  # Random 0-10 POS transactions
                "pos_pin_transactions": random.randint(0, 5),  # Random 0-5 PIN transactions
                "total_atm_transactions": random.randint(0, 8),  # Random 0-8 ATM transactions
                "pnc_atm_transactions": random.randint(0, 5),  # Random 0-5 PNC ATM transactions
                "other_atm_transactions": random.randint(0, 3),  # Random 0-3 other ATM transactions
                "apy_earned": f"{random.uniform(0.01, 0.5):.2f}%",  # Random 0.01%-0.5% APY
                "days_in_period": random.randint(28, 31),  # Random 28-31 days
                "average_collected_balance": f"${round(random.uniform(initial_balance, ending_balance), 2):,.2f}",  # Random between initial and ending balance
                "interest_paid_period": f"${random.uniform(0.1, 10):,.2f}",  # Random $0.10-$10 interest
                "interest_paid_ytd": f"${random.uniform(1, 50):,.2f}",  # Random $1-$50 YTD interest
                "overdraft_protection1": "PNC Savings Account XXXX1234" if random.choice([True, False]) else "",
                "overdraft_protection2": "PNC Credit Line XXXX5678" if random.choice([True, False]) else "",
                "overdraft_status": "Opted-In" if random.choice([True, False]) else "Opted-Out"
            },
            "transactions": transactions,
            "daily_balances": daily_balances,
            "account_type": specific_account_name,
            "show_fee_waiver": ending_balance >= 5000
        }
    else:  # Chase
        deposits = [
            {"date": row["Date"], "description": row["Description"], "amount": f"${row['Amount']:,.2f}"}
            for _, row in df.iterrows() if row['Amount'] > 0
        ]
        withdrawals = [
            {"date": row["Date"], "description": row["Description"], "amount": f"${abs(row['Amount']):,.2f}"}
            for _, row in df.iterrows() if row['Amount'] < 0
        ]
        daily_balances = []
        balance_dict = {}
        for _, row in df.iterrows():
            date = row["Date"]
            balance_dict[date] = row["Balance"]
        for date in sorted(balance_dict.keys()):
            daily_balances.append({"date": date, "amount": balance_dict[date]})

        template_data = {
            "account_holder": account_holder,
            "account_holder_address": address,
            "account_number": account_number,
            "statement_period": f"{min_date} through {max_date}",
            "statement_date": statement_date,
            "logo_path": logo_data,
            "summary": {
                "beginning_balance": f"${initial_balance:,.2f}",
                "deposits_count": len(deposits),
                "deposits_total": f"${deposits_total:,.2f}",
                "withdrawals_count": len(withdrawals) + (1 if service_fee else 0),
                "withdrawals_total": f"${withdrawals_total:,.2f}",
                "ending_balance": f"${ending_balance:,.2f}",
                "transactions_count": transactions_count,
                # New synthetic data for Chase-specific fields (if needed)
                "average_balance": f"${round((initial_balance + ending_balance) / 2, 2):,.2f}",
                "fees": f"${service_fee:,.2f}",
                "checks_written": random.randint(0, 5),
                "pos_transactions": random.randint(0, 10),
                "pos_pin_transactions": random.randint(0, 5),
                "total_atm_transactions": random.randint(0, 8),
                "pnc_atm_transactions": random.randint(0, 5),
                "other_atm_transactions": random.randint(0, 3),
                "apy_earned": f"{random.uniform(0.01, 0.5):.2f}%",
                "days_in_period": random.randint(28, 31),
                "average_collected_balance": f"${round(random.uniform(initial_balance, ending_balance), 2):,.2f}",
                "interest_paid_period": f"${random.uniform(0.1, 10):,.2f}",
                "interest_paid_ytd": f"${random.uniform(1, 50):,.2f}",
                "overdraft_protection1": "Chase Savings Account XXXX1234" if random.choice([True, False]) else "",
                "overdraft_protection2": "Chase Credit Line XXXX5678" if random.choice([True, False]) else "",
                "overdraft_status": "Opted-In" if random.choice([True, False]) else "Opted-Out"
            },
            "deposits": deposits,
            "withdrawals": withdrawals,
            "daily_balances": daily_balances,
            "account_type": specific_account_name,
            "show_fee_waiver": ending_balance >= 5000
        }

    results = []
    for template_file in BANK_CONFIG[bank]["templates"]:
        if not os.path.exists(os.path.join(template_dir, template_file)):
            raise FileNotFoundError(f"Template {template_file} not found in {template_dir}")

        template = env.get_template(template_file)
        template_name = os.path.splitext(template_file)[0]
        safe_account_holder = ''.join(c for c in account_holder if c.isalnum() or c in (' ', '_')).replace(' ', '_')
        safe_account_name = specific_account_name.replace(' ', '_')
        html_filename = os.path.join(html_dir, f"bank_statement_{safe_account_name}_{safe_account_holder}_{bank}_{template_name}.html")
        pdf_filename = os.path.join(pdf_dir, f"bank_statement_{safe_account_name}_{safe_account_holder}_{bank}_{template_name}.pdf")
        csv_filename = os.path.join(csv_dir, f"bank_statement_{safe_account_name}_{safe_account_holder}_{bank}.csv")

        rendered_html = template.render(**template_data)

        with open(html_filename, 'w', encoding='utf-8') as f:
            f.write(rendered_html)

        wkhtmltopdf_path = "C:\\Program Files\\wkhtmltopdf\\bin\\wkhtmltopdf.exe"  # Adjust if needed
        config = pdfkit.configuration(wkhtmltopdf=wkhtmltopdf_path)
        options = {
            "enable-local-file-access": "",
            "page-size": "Letter",
            "margin-top": "0.8in",
            "margin-right": "0.9in",
            "margin-bottom": "0.8in",
            "margin-left": "0.9in",
            "encoding": "UTF-8",
            "disable-javascript": "",
            "image-dpi": "300",
            "enable-forms": "",
            "no-outline": "",
            "print-media-type": ""
        }
        try:
            pdfkit.from_string(rendered_html, pdf_filename, configuration=config, options=options)
        except OSError as e:
            print(f"PDF generation failed for {bank} template {template_file}: {e}. Ensure wkhtmltopdf is installed and accessible.")

        df.to_csv(csv_filename, index=False, encoding='utf-8')

        results.append((html_filename, pdf_filename, csv_filename))

    return results

In [9]:
# Cell 9: Main execution
if __name__ == "__main__":
    import random
    
    num_transactions = random.randint(3, 12)
    account_holder = fake.company().upper() if random.choice([True, False]) else fake.name().upper()
    template_dir = TEMPLATES_DIR
    output_dir = SYNTHETIC_STAT_DIR
    banks = ["chase", "citibank", "wellsfargo", "pnc"]
    account_types = ["business", "personal"]
    
    for account_type in account_types:
        for bank in banks:
            specific_account_name = BANK_CONFIG[bank]["account_types"][account_type]
            print(f"\nGenerating {specific_account_name} statement for {bank.capitalize()}")
            df = generate_bank_statement(num_transactions, account_holder, account_type)
            
            statement_fields = identify_template_fields(bank, template_dir)
            print(f"Identified Template Fields for {bank.capitalize()}:")
            for field in statement_fields.fields:
                print(f"- {field.name}: {'Mutable' if field.is_mutable else 'Immutable'}, {field.description}")
            
            results = generate_populated_html_and_pdf(df, account_holder, bank, template_dir, output_dir, account_type)
            for html_file, pdf_file, csv_file in results:
                print(f"HTML saved as: {html_file}")
                print(f"PDF saved as: {pdf_file}")
                print(f"CSV saved as: {csv_file}")


Generating Chase Business Complete Checking statement for Chase
Identified Template Fields for Chase:
- account_type: Immutable, Type of account (e.g., checking, savings)
- logo_path: Mutable, Path to bank logo
- statement_period: Mutable, Time period covered by the statement (e.g., April 2022 - March 2023)
- account_number: Immutable, Unique number assigned to the account by Chase Bank
- account_holder: Mutable, Name of the person or entity that holds the account
- account_holder_address: Mutable, Address of the account holder as it appears on Chase bank records
- summary.beginning_balance: Immutable, Balance at the start of the statement period
- summary.deposits_count: Immutable, Number of deposits made during the statement period
- summary.deposits_total: Mutable, Total amount deposited during the statement period
- summary.withdrawals_count: Immutable, Number of withdrawals made during the statement period
- summary.withdrawals_total: Mutable, Total amount withdrawn during the st

and fails to parse leap day. The default behavior will change in Python 3.15
to either always raise an exception or to use a different default year (TBD).
To avoid trouble, add a specific year to the input & format.
See https://github.com/python/cpython/issues/70647.
  min_date = datetime.strptime(min(df['Date']), "%m/%d").replace(year=2025).strftime("%B %d")


HTML saved as: new_statements\HTML\bank_statement_Chase_Business_Complete_Checking_TIMOTHY_MARTINEZ_chase_chase_classic_style.html
PDF saved as: new_statements\PDF\bank_statement_Chase_Business_Complete_Checking_TIMOTHY_MARTINEZ_chase_chase_classic_style.pdf
CSV saved as: new_statements\CSV\bank_statement_Chase_Business_Complete_Checking_TIMOTHY_MARTINEZ_chase.csv
HTML saved as: new_statements\HTML\bank_statement_Chase_Business_Complete_Checking_TIMOTHY_MARTINEZ_chase_chase_variation_1.html
PDF saved as: new_statements\PDF\bank_statement_Chase_Business_Complete_Checking_TIMOTHY_MARTINEZ_chase_chase_variation_1.pdf
CSV saved as: new_statements\CSV\bank_statement_Chase_Business_Complete_Checking_TIMOTHY_MARTINEZ_chase.csv
HTML saved as: new_statements\HTML\bank_statement_Chase_Business_Complete_Checking_TIMOTHY_MARTINEZ_chase_chase_variation_2.html
PDF saved as: new_statements\PDF\bank_statement_Chase_Business_Complete_Checking_TIMOTHY_MARTINEZ_chase_chase_variation_2.pdf
CSV saved as: n