In [47]:
# Cell 1: Import libraries
import os
import json
import re
import base64  # New import for base64 encoding
from faker import Faker
from datetime import datetime, timedelta
import random
import pandas as pd
import ollama
from pydantic import BaseModel, Field
from typing import List, Dict
from jinja2 import Environment, FileSystemLoader
import pdfkit


# Initialize Faker
fake = Faker()


In [48]:

# Cell 2: Directory setup and bank configuration
SAMPLE_STATEMENT_DIR = "sample_statements"
SAMPLE_LOGOS_DIR = "sample_logos"
SYNTHETIC_STAT_DIR = "synthetic_statements"
TEMPLATES_DIR = "templates"

# Create directories if they don’t exist
for directory in [SAMPLE_STATEMENT_DIR, SAMPLE_LOGOS_DIR, SYNTHETIC_STAT_DIR, TEMPLATES_DIR]:
    os.makedirs(directory, exist_ok=True)

# Bank configuration: map bank names to their logos and templates
BANK_CONFIG = {
    "chase": {
        "logo": "chase_bank_logo.png",
        "template": "chase_mail_style.html"
    },
    "citibank": {
        "logo": "citibank_logo.png",
        "template": "citibank_template.html"
    }
}

# Validate bank configuration
for bank, config in BANK_CONFIG.items():
    logo_path = os.path.join(SAMPLE_LOGOS_DIR, config["logo"])
    template_path = os.path.join(TEMPLATES_DIR, config["template"])
    if not os.path.exists(logo_path):
        raise FileNotFoundError(f"Logo file not found for {bank}: {logo_path}")
    if not os.path.exists(template_path):
        raise FileNotFoundError(f"Template file not found for {bank}: {template_path}")

In [49]:

# Cell 3: Pydantic models
class FieldDefinition(BaseModel):
    name: str = Field(..., description="Field name (e.g., account_holder, account_number)")
    is_mutable: bool = Field(..., description="Whether the field is mutable")
    description: str = Field(..., description="Description of the field")

class StatementFields(BaseModel):
    fields: List[FieldDefinition] = Field(..., description="List of mutable and immutable fields")

class Transaction(BaseModel):
    description: str = Field(..., max_length=35, description="Transaction description")
    category: str
    amount: float


In [50]:

# Cell 4: Generate category lists
def generate_category_lists() -> tuple[List[str], List[str]]:
    prompt = """
    Generate two lists of bank transaction categories in JSON format for Chase bank statements.
    One list for reasons someone loses money (e.g., utilities, subscriptions) and one for reasons someone gains money (e.g., deposits, refunds).
    Each list should have 5 unique categories, each 1-2 words, title case, no punctuation.
    Return:
    {
      "loss_categories": ["Category One", "Category Two", ...],
      "gain_categories": ["Category One", "Category Two", ...]
    }
    """
    try:
        response = ollama.generate(model="mistral:7b-instruct-v0.3-q4_0", prompt=prompt)
        category_data = json.loads(response['response'].strip())
        loss_categories = [cat for cat in category_data.get("loss_categories", []) if isinstance(cat, str) and 1 <= len(cat.split()) <= 2]
        gain_categories = [cat for cat in category_data.get("gain_categories", []) if isinstance(cat, str) and 1 <= len(cat.split()) <= 2]
        if len(loss_categories) < 5 or len(gain_categories) < 5:
            raise ValueError("Insufficient valid categories")
    except (json.JSONDecodeError, ValueError):
        loss_categories = ["Utility Payment", "Subscription Fee", "Online Purchase", "Rent Payment", "Grocery Shopping"]
        gain_categories = ["Salary Deposit", "Tax Refund", "Gift Received", "Client Payment", "Cash Deposit"]
    return loss_categories, gain_categories


In [51]:

# Cell 5: Generate transaction description
def generate_transaction_description(amount: float, category: str) -> dict:
    prompt = f"""
    Generate a bank transaction description (3-5 words, max 45 characters) for a bank transaction in the '{category}' category.
    Rules:
    - Use title case.
    - No punctuation.
    - No parentheses, dashes, or dollar signs.
    - No amounts or numbers as words.
    - Use simple phrases.
    - Examples: 'Grocery Store Purchase', 'Utility Bill Payment'
    """
    try:
        response = ollama.generate(model="mistral:7b-instruct-v0.3-q4_0", prompt=prompt)
        description = response['response'].strip()[:25]
    except:
        description = f"{category} Transaction"
    description = description.replace("(", "").replace(")", "").replace(",", "").replace(":", "").replace("-", "").replace("$", "").replace(".", "")
    description = ' '.join(word.capitalize() for word in description.split())[:25]
    words = description.split()
    if len(words) < 3 or len(words) > 5:
        description = f"{category} Transaction"[:45]
    transaction = Transaction(description=description, category=category, amount=amount)
    return transaction.dict()


In [52]:

# Cell 6: Generate synthetic bank statement
def generate_bank_statement(num_transactions: int, account_holder: str) -> pd.DataFrame:
    loss_categories, gain_categories = generate_category_lists()
    start_date = datetime.now() - timedelta(days=30)
    dates = [start_date + timedelta(days=random.randint(0, 30)) for _ in range(num_transactions)]
    transactions = []
    for _ in range(num_transactions):
        is_gain = random.choice([True, False])
        category = random.choice(gain_categories if is_gain else loss_categories)
        amount = round(random.uniform(50, 1000), 2) if is_gain else round(random.uniform(-500, -10), 2)
        transaction = generate_transaction_description(amount, category)
        transactions.append(transaction)
    data = {
        "Date": [d.strftime("%m/%d") for d in dates],
        "Description": [t["description"] for t in transactions],
        "Category": [t["category"] for t in transactions],
        "Amount": [t["amount"] for t in transactions],
        "Balance": [0.0] * num_transactions,
        "Account Holder": [account_holder] * num_transactions,
        "Transaction ID": [(fake.bban()[:10] + str(i).zfill(4)) for i in range(num_transactions)]
    }
    df = pd.DataFrame(data)
    df = df.sort_values("Date")
    initial_balance = round(random.uniform(1000, 20000), 2)
    df["Balance"] = initial_balance + df["Amount"].cumsum()
    return df


In [53]:
# Cell 7: Identify mutable and immutable fields using Ollama
def identify_template_fields(bank: str, templates_dir: str = TEMPLATES_DIR) -> StatementFields:
    if bank not in BANK_CONFIG:
        raise ValueError(f"Unsupported bank: {bank}. Supported banks: {list(BANK_CONFIG.keys())}")
    
    template_path = os.path.join(templates_dir, BANK_CONFIG[bank]["template"])
    if not os.path.exists(template_path):
        raise FileNotFoundError(f"Template file not found: {template_path}")
    
    with open(template_path, 'r', encoding='utf-8') as f:
        template_content = f.read()
    
    placeholders = re.findall(r'\{\{([^{}]+)\}\}', template_content)
    placeholders = [p.strip() for p in placeholders]
    
    prompt = f"""
    Given the following list of placeholders extracted from a {bank.capitalize()} bank statement HTML template, classify each as mutable (can be changed with synthetic data) or immutable (static, e.g., bank details or table headers). Return a JSON object with a list of fields, each containing the field name, whether it is mutable (true/false), and a brief description of what the field represents. Example:
    {{
        "fields": [
            {{"name": "account_holder", "is_mutable": true, "description": "Name of the account holder"}},
            {{"name": "bank_name", "is_mutable": false, "description": "Name of the bank"}}
        ]
    }}
    Placeholders:
    {', '.join(placeholders)}
    Rules:
    - Mutable fields include account holder, client number, date of birth, account number, IBAN, statement period, statement date, transactions, opening balance, total debit, total credit, total balance, and other customer-specific data.
    - Immutable fields include bank name, bank address, table headers, customer service information, and footnotes.
    - Ensure descriptions are relevant to {bank.capitalize()} bank statements.
    """
    try:
        response = ollama.generate(model="mistral:7b-instruct-v0.3-q4_0", prompt=prompt)
        fields_data = json.loads(response['response'].strip())
        statement_fields = StatementFields(**fields_data)
    except (json.JSONDecodeError, ValueError, ollama.RequestError) as e:
        print(f"Ollama failed for {bank}: {e}. Using fallback fields.")
        default_fields = [
            FieldDefinition(name="account_holder", is_mutable=True, description="Name of the account holder"),
            FieldDefinition(name="client_number", is_mutable=True, description="Client identification number"),
            FieldDefinition(name="date_of_birth", is_mutable=True, description="Customer's date of birth"),
            FieldDefinition(name="customer_account_number", is_mutable=True, description="Customer's account number"),
            FieldDefinition(name="customer_iban", is_mutable=True, description="Customer's IBAN"),
            FieldDefinition(name="customer_bank_name", is_mutable=True, description="Name of the customer's bank"),
            FieldDefinition(name="statement_period", is_mutable=True, description="Statement date range"),
            FieldDefinition(name="statement_date", is_mutable=True, description="Date the statement was created"),
            FieldDefinition(name="opening_balance_debit", is_mutable=True, description="Opening balance debit amount"),
            FieldDefinition(name="opening_balance_credit", is_mutable=True, description="Opening balance credit amount"),
            FieldDefinition(name="opening_balance", is_mutable=True, description="Opening balance total"),
            FieldDefinition(name="transactions", is_mutable=True, description="List of transaction details"),
            FieldDefinition(name="total_debit", is_mutable=True, description="Total debit amount"),
            FieldDefinition(name="total_credit", is_mutable=True, description="Total credit amount"),
            FieldDefinition(name="total", is_mutable=True, description="Total balance"),
            FieldDefinition(name="bank_name", is_mutable=False, description=f"Name of the bank ({bank.capitalize()})"),
            FieldDefinition(name="bank_address", is_mutable=False, description="Bank address"),
            FieldDefinition(name="table_headers", is_mutable=False, description="Headers for transaction table"),
            FieldDefinition(name="customer_service", is_mutable=False, description="Customer service contact information"),
            FieldDefinition(name="footnotes", is_mutable=False, description="Footnotes and disclosures")
        ] if bank == "citibank" else [
            FieldDefinition(name="account_holder", is_mutable=True, description="Name of the account holder"),
            FieldDefinition(name="account_holder_address", is_mutable=True, description="Address of the account holder"),
            FieldDefinition(name="account_number", is_mutable=True, description="Account number"),
            FieldDefinition(name="statement_period", is_mutable=True, description="Statement date range"),
            FieldDefinition(name="summary", is_mutable=True, description="Checking summary data (beginning balance, counts, totals)"),
            FieldDefinition(name="deposits", is_mutable=True, description="List of deposit transactions"),
            FieldDefinition(name="withdrawals", is_mutable=True, description="List of withdrawal transactions"),
            FieldDefinition(name="daily_balances", is_mutable=True, description="Daily ending balances"),
            FieldDefinition(name="logo_path", is_mutable=True, description="Path to the bank logo"),
            FieldDefinition(name="bank_name", is_mutable=False, description=f"Name of the bank ({bank.capitalize()})"),
            FieldDefinition(name="bank_address", is_mutable=False, description="Bank address"),
            FieldDefinition(name="checking_summary_header", is_mutable=False, description="Header for checking summary"),
            FieldDefinition(name="deposits_header", is_mutable=False, description="Header for deposits section"),
            FieldDefinition(name="withdrawals_header", is_mutable=False, description="Header for withdrawals section"),
            FieldDefinition(name="daily_balance_header", is_mutable=False, description="Header for daily balance section"),
            FieldDefinition(name="customer_service", is_mutable=False, description="Customer service contact information"),
            FieldDefinition(name="footnotes", is_mutable=False, description="Footnotes and disclosures")
        ]
        statement_fields = StatementFields(fields=default_fields)
    
    log_path = os.path.join(SYNTHETIC_STAT_DIR, f"template_fields_{bank}.json")
    with open(log_path, 'w', encoding='utf-8') as f:
        json.dump(statement_fields.model_dump(), f, indent=2)
    
    return statement_fields

In [54]:
# Cell 8: Generate populated HTML and PDF
def generate_populated_html_and_pdf(df: pd.DataFrame, account_holder: str, bank: str, template_dir: str, output_dir: str) -> list:
    if bank not in BANK_CONFIG:
        raise ValueError(f"Unsupported bank: {bank}. Supported banks: {list(BANK_CONFIG.keys())}")
    
    # Initialize environment for templates
    env = Environment(loader=FileSystemLoader(template_dir))
    
    # Generate initial data
    initial_balance = round(random.uniform(1000, 20000), 2)
    deposits_total = sum(x for x in df['Amount'] if x > 0)
    withdrawals_total = abs(sum(x for x in df['Amount'] if x < 0))
    ending_balance = initial_balance + deposits_total - withdrawals_total
    service_fee = 25 if ending_balance < 5000 else 0
    if service_fee:
        withdrawals_total += service_fee
        ending_balance -= service_fee
    
    transactions_count = len(df)
    
    min_date = datetime.strptime(min(df['Date']), "%m/%d").replace(year=2025).strftime("%B %d, %Y")
    max_date = datetime.strptime(max(df['Date']), "%m/%d").replace(year=2025).strftime("%B %d, %Y")
    statement_date = datetime.now().strftime("%B %d, %Y")
    
    # Common data
    address = fake.address().replace('\n', '<br>')[:100]
    account_holder = account_holder[:50]
    account_number = fake.bban()[:15]
    
    # Embed logo as base64
    logo_path = os.path.join(SAMPLE_LOGOS_DIR, BANK_CONFIG[bank]["logo"])
    logo_data = ""
    if os.path.exists(logo_path):
        with open(logo_path, "rb") as img_file:
            logo_data = f"data:image/png;base64,{base64.b64encode(img_file.read()).decode('utf-8')}"
    
    # Bank-specific template data
    if bank == "citibank":
        transactions = []
        total_debit = abs(sum(x for x in df['Amount'] if x < 0))
        total_credit = sum(x for x in df['Amount'] if x > 0)
        running_balance = initial_balance
        for _, row in df.iterrows():
            amount = row['Amount']
            debit = f"${abs(amount):,.2f}" if amount < 0 else ""
            credit = f"${amount:,.2f}" if amount > 0 else ""
            running_balance += amount
            transactions.append({
                "date": row["Date"],
                "description": row["Description"],
                "debit": debit,
                "credit": credit,
                "balance": f"${running_balance:,.2f}"
            })
        
        template_data = {
            "account_holder": account_holder,
            "client_number": fake.uuid4()[:8],
            "date_of_birth": fake.date_of_birth(minimum_age=18, maximum_age=80).strftime("%m/%d/%Y"),
            "customer_account_number": account_number,
            "customer_iban": f"GB{fake.random_number(digits=2)}CITI{fake.random_number(digits=14)}",
            "customer_bank_name": "Citibank",
            "statement_period": f"{min_date} through {max_date}",
            "statement_date": statement_date,
            "opening_balance_debit": "",
            "opening_balance_credit": "",
            "opening_balance": f"${initial_balance:,.2f}",
            "transactions": transactions,
            "total_debit": f"${total_debit:,.2f}",
            "total_credit": f"${total_credit:,.2f}",
            "total": f"${ending_balance:,.2f}",
            "logo_path": logo_data
        }
    else:  # Chase
        deposits = [
            {"date": row["Date"], "description": row["Description"], "amount": f"${row['Amount']:,.2f}"}
            for _, row in df.iterrows() if row['Amount'] > 0
        ]
        withdrawals = [
            {"date": row["Date"], "description": row["Description"], "amount": f"${abs(row['Amount']):,.2f}"}
            for _, row in df.iterrows() if row['Amount'] < 0
        ]
        daily_balances = []
        balance_dict = {}
        for _, row in df.iterrows():
            date = row["Date"]
            balance_dict[date] = row["Balance"]
        for date in sorted(balance_dict.keys()):
            daily_balances.append({"date": date, "amount": balance_dict[date]})
        
        template_data = {
            "account_holder": account_holder,
            "account_holder_address": address,
            "account_number": account_number,
            "statement_period": f"{min_date} through {max_date}",
            "summary": {
                "beginning_balance": f"${initial_balance:,.2f}",
                "deposits_count": len(deposits),
                "deposits_total": f"${deposits_total:,.2f}",
                "withdrawals_count": len(withdrawals) + (1 if service_fee else 0),
                "withdrawals_total": f"${withdrawals_total:,.2f}",
                "ending_balance": f"${ending_balance:,.2f}",
                "transactions_count": transactions_count
            },
            "deposits": deposits,
            "withdrawals": withdrawals,
            "daily_balances": daily_balances,
            "logo_path": logo_data,
            "bank_name": bank.capitalize(),
            "show_fee_waiver": service_fee == 0
        }
    
    results = []
    template_file = BANK_CONFIG[bank]["template"]
    if not os.path.exists(os.path.join(template_dir, template_file)):
        raise FileNotFoundError(f"Template {template_file} not found in {template_dir}")
    
    template = env.get_template(template_file)
    html_filename = os.path.join(output_dir, f"bank_statement_{account_holder.replace(' ', '_')}_{bank}.html")
    pdf_filename = os.path.join(output_dir, f"bank_statement_{account_holder.replace(' ', '_')}_{bank}.pdf")
    
    rendered_html = template.render(**template_data)
    
    with open(html_filename, 'w', encoding='utf-8') as f:
        f.write(rendered_html)
    
    wkhtmltopdf_path = "C:\\Program Files\\wkhtmltopdf\\bin\\wkhtmltopdf.exe"  # Adjust if needed
    config = pdfkit.configuration(wkhtmltopdf=wkhtmltopdf_path)
    options = {
        "enable-local-file-access": "",
        "page-size": "Letter",
        "margin-top": "0.75in",
        "margin-right": "0.75in",
        "margin-bottom": "0.75in",
        "margin-left": "0.75in",
        "encoding": "UTF-8",
        "disable-javascript": "",
        "image-dpi": "300",
        "viewport-size": "1280x1024",
        "enable-smart-shrinking": "",
        "zoom": "1.0"
    }
    try:
        pdfkit.from_string(rendered_html, pdf_filename, configuration=config, options=options)
        results.append((html_filename, pdf_filename))
    except OSError as e:
        print(f"PDF generation failed for {bank} template: {e}. Ensure wkhtmltopdf is installed and accessible.")
    
    return results

In [55]:
# Cell 9: Main execution
if __name__ == "__main__":
    import random
    
    num_transactions = random.randint(3, 12)
    account_holder = fake.company().upper()
    template_dir = TEMPLATES_DIR
    output_dir = SYNTHETIC_STAT_DIR
    banks = ["chase", "citibank"]  # List of banks to generate statements for
    
    for bank in banks:
        print(f"\nGenerating statement for {bank.capitalize()}")
        df = generate_bank_statement(num_transactions, account_holder)
        csv_filename = os.path.join(SYNTHETIC_STAT_DIR, f"bank_statement_{account_holder.replace(' ', '_')}_{bank}.csv")
        df.to_csv(csv_filename, index=False)
        print(f"CSV saved as: {csv_filename}")
        
        statement_fields = identify_template_fields(bank, template_dir)
        print(f"Identified Template Fields for {bank.capitalize()}:")
        for field in statement_fields.fields:
            print(f"- {field.name}: {'Mutable' if field.is_mutable else 'Immutable'}, {field.description}")
        
        results = generate_populated_html_and_pdf(df, account_holder, bank, template_dir, output_dir)
        for html_file, pdf_file in results:
            print(f"HTML saved as: {html_file}")
            print(f"PDF saved as: {pdf_file}")


Generating statement for Chase


C:\Users\cha\AppData\Local\Temp\ipykernel_8632\477407964.py:24: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  return transaction.dict()


CSV saved as: synthetic_statements\bank_statement_SPEARS_PLC_chase.csv
Identified Template Fields for Chase:
- logo_path: Immutable, Path to the logo image on the statement
- statement_period: Mutable, Period covered by the bank statement (e.g., January 1 - March 31)
- account_number: Immutable, Unique number identifying the customer's account with Chase Bank
- account_holder: Mutable, Name of the individual or entity holding the account at Chase Bank
- account_holder_address: Mutable, Address associated with the account holder at Chase Bank
- summary.beginning_balance: Immutable, The balance of the account at the start of the statement period
- summary.deposits_count: Immutable, Number of deposits made to the account during the statement period
- summary.deposits_total: Immutable, Total amount deposited into the account during the statement period
- summary.withdrawals_count: Immutable, Number of withdrawals made from the account during the statement period
- summary.withdrawals_total