In [30]:
# Cell 1: Import libraries
import os
import json
import re
import base64  # New import for base64 encoding
from faker import Faker
from datetime import datetime, timedelta
import random
import pandas as pd
import ollama
from pydantic import BaseModel, Field
from typing import List, Dict
from jinja2 import Environment, FileSystemLoader
import pdfkit


# Initialize Faker
fake = Faker()


In [31]:

# Cell 2: Directory setup
SAMPLE_STATEMENT_DIR = "sample_statements"
SAMPLE_LOGOS_DIR = "sample_logos"
SYNTHETIC_STAT_DIR = "synthetic_statements"
TEMPLATES_DIR = "templates"

# Create directories if they don’t exist
for directory in [SAMPLE_STATEMENT_DIR, SAMPLE_LOGOS_DIR, SYNTHETIC_STAT_DIR, TEMPLATES_DIR]:
    os.makedirs(directory, exist_ok=True)

# Chase-specific logo
BANK_LOGO = "chase_bank_logo.png"
BANK_NAME = "chase"


In [32]:

# Cell 3: Pydantic models
class FieldDefinition(BaseModel):
    name: str = Field(..., description="Field name (e.g., account_holder, account_number)")
    is_mutable: bool = Field(..., description="Whether the field is mutable")
    description: str = Field(..., description="Description of the field")

class StatementFields(BaseModel):
    fields: List[FieldDefinition] = Field(..., description="List of mutable and immutable fields")

class Transaction(BaseModel):
    description: str = Field(..., max_length=35, description="Transaction description")
    category: str
    amount: float


In [33]:

# Cell 4: Generate category lists
def generate_category_lists() -> tuple[List[str], List[str]]:
    prompt = """
    Generate two lists of bank transaction categories in JSON format for Chase bank statements.
    One list for reasons someone loses money (e.g., utilities, subscriptions) and one for reasons someone gains money (e.g., deposits, refunds).
    Each list should have 5 unique categories, each 1-2 words, title case, no punctuation.
    Return:
    {
      "loss_categories": ["Category One", "Category Two", ...],
      "gain_categories": ["Category One", "Category Two", ...]
    }
    """
    try:
        response = ollama.generate(model="mistral:7b-instruct-v0.3-q4_0", prompt=prompt)
        category_data = json.loads(response['response'].strip())
        loss_categories = [cat for cat in category_data.get("loss_categories", []) if isinstance(cat, str) and 1 <= len(cat.split()) <= 2]
        gain_categories = [cat for cat in category_data.get("gain_categories", []) if isinstance(cat, str) and 1 <= len(cat.split()) <= 2]
        if len(loss_categories) < 5 or len(gain_categories) < 5:
            raise ValueError("Insufficient valid categories")
    except (json.JSONDecodeError, ValueError):
        loss_categories = ["Utility Payment", "Subscription Fee", "Online Purchase", "Rent Payment", "Grocery Shopping"]
        gain_categories = ["Salary Deposit", "Tax Refund", "Gift Received", "Client Payment", "Cash Deposit"]
    return loss_categories, gain_categories


In [34]:

# Cell 5: Generate transaction description
def generate_transaction_description(amount: float, category: str) -> dict:
    prompt = f"""
    Generate a bank transaction description (3-5 words, max 45 characters) for a bank transaction in the '{category}' category.
    Rules:
    - Use title case.
    - No punctuation.
    - No parentheses, dashes, or dollar signs.
    - No amounts or numbers as words.
    - Use simple phrases.
    - Examples: 'Grocery Store Purchase', 'Utility Bill Payment'
    """
    try:
        response = ollama.generate(model="mistral:7b-instruct-v0.3-q4_0", prompt=prompt)
        description = response['response'].strip()[:25]
    except:
        description = f"{category} Transaction"
    description = description.replace("(", "").replace(")", "").replace(",", "").replace(":", "").replace("-", "").replace("$", "").replace(".", "")
    description = ' '.join(word.capitalize() for word in description.split())[:25]
    words = description.split()
    if len(words) < 3 or len(words) > 5:
        description = f"{category} Transaction"[:45]
    transaction = Transaction(description=description, category=category, amount=amount)
    return transaction.dict()


In [35]:

# Cell 6: Generate synthetic bank statement
def generate_bank_statement(num_transactions: int, account_holder: str) -> pd.DataFrame:
    loss_categories, gain_categories = generate_category_lists()
    start_date = datetime.now() - timedelta(days=30)
    dates = [start_date + timedelta(days=random.randint(0, 30)) for _ in range(num_transactions)]
    transactions = []
    for _ in range(num_transactions):
        is_gain = random.choice([True, False])
        category = random.choice(gain_categories if is_gain else loss_categories)
        amount = round(random.uniform(50, 1000), 2) if is_gain else round(random.uniform(-500, -10), 2)
        transaction = generate_transaction_description(amount, category)
        transactions.append(transaction)
    data = {
        "Date": [d.strftime("%m/%d") for d in dates],
        "Description": [t["description"] for t in transactions],
        "Category": [t["category"] for t in transactions],
        "Amount": [t["amount"] for t in transactions],
        "Balance": [0.0] * num_transactions,
        "Account Holder": [account_holder] * num_transactions,
        "Transaction ID": [(fake.bban()[:10] + str(i).zfill(4)) for i in range(num_transactions)]
    }
    df = pd.DataFrame(data)
    df = df.sort_values("Date")
    initial_balance = round(random.uniform(1000, 20000), 2)
    df["Balance"] = initial_balance + df["Amount"].cumsum()
    return df


In [36]:

# Cell 7: Identify mutable and immutable fields using Ollama
def identify_template_fields(template_path: str) -> StatementFields:
    if not os.path.exists(template_path):
        raise FileNotFoundError(f"Template file not found: {template_path}")
    
    with open(template_path, 'r') as f:
        template_content = f.read()
    
    placeholders = re.findall(r'\{\{([^{}]+)\}\}', template_content)
    placeholders = [p.strip() for p in placeholders]
    
    prompt = f"""
    Given the following list of placeholders extracted from a Chase bank statement HTML template, classify each as mutable (can be changed with synthetic data) or immutable (static, e.g., bank details or table headers). Return a JSON object with a list of fields, each containing the field name, whether it is mutable (true/false), and a brief description of what the field represents. Example:
    {{
        "fields": [
            {{"name": "account_holder", "is_mutable": true, "description": "Name of the account holder"}},
            {{"name": "bank_name", "is_mutable": false, "description": "Name of the bank (JPMorgan Chase)"}}
        ]
    }}
    Placeholders:
    {', '.join(placeholders)}
    Rules:
    - Mutable fields include account holder, account holder's address, account number, statement period, transactions, deposits, withdrawals, daily balances, and summary data (e.g., beginning balance, counts, totals).
    - Immutable fields include bank name, bank address, table headers, customer service information, and footnotes.
    """
    try:
        response = ollama.generate(model="mistral:7b-instruct-v0.3-q4_0", prompt=prompt)
        fields_data = json.loads(response['response'].strip())
        statement_fields = StatementFields(**fields_data)
    except (json.JSONDecodeError, ValueError, ollama.RequestError) as e:
        print(f"Ollama failed: {e}. Using fallback fields.")
        default_fields = [
            FieldDefinition(name="account_holder", is_mutable=True, description="Name of the account holder"),
            FieldDefinition(name="account_holder_address", is_mutable=True, description="Address of the account holder"),
            FieldDefinition(name="account_number", is_mutable=True, description="Account number"),
            FieldDefinition(name="statement_period", is_mutable=True, description="Statement date range"),
            FieldDefinition(name="summary", is_mutable=True, description="Checking summary data (beginning balance, counts, totals)"),
            FieldDefinition(name="deposits", is_mutable=True, description="List of deposit transactions"),
            FieldDefinition(name="withdrawals", is_mutable=True, description="List of withdrawal transactions"),
            FieldDefinition(name="daily_balances", is_mutable=True, description="Daily ending balances"),
            FieldDefinition(name="logo_path", is_mutable=True, description="Path to the bank logo"),
            FieldDefinition(name="bank_name", is_mutable=False, description="Name of the bank (JPMorgan Chase)"),
            FieldDefinition(name="bank_address", is_mutable=False, description="Bank address"),
            FieldDefinition(name="checking_summary_header", is_mutable=False, description="Header for checking summary"),
            FieldDefinition(name="deposits_header", is_mutable=False, description="Header for deposits section"),
            FieldDefinition(name="withdrawals_header", is_mutable=False, description="Header for withdrawals section"),
            FieldDefinition(name="daily_balance_header", is_mutable=False, description="Header for daily balance section"),
            FieldDefinition(name="customer_service", is_mutable=False, description="Customer service contact information"),
            FieldDefinition(name="footnotes", is_mutable=False, description="Footnotes and disclosures")
        ]
        statement_fields = StatementFields(fields=default_fields)
    
    log_path = os.path.join(SYNTHETIC_STAT_DIR, "template_fields.json")
    with open(log_path, 'w') as f:
        json.dump(statement_fields.model_dump(), f, indent=2)  # Use model_dump instead of dict
    
    return statement_fields


In [37]:
# Cell 8: Generate populated HTML and PDF
def generate_populated_html_and_pdf(df: pd.DataFrame, account_holder: str, template_dir: str, output_dir: str) -> list:
    import os
    import random
    from datetime import datetime
    
    # Initialize environment for templates
    env = Environment(loader=FileSystemLoader(template_dir))
    
    # Generate initial data
    initial_balance = round(random.uniform(1000, 20000), 2)
    deposits_total = sum(x for x in df['Amount'] if x > 0)
    withdrawals_total = abs(sum(x for x in df['Amount'] if x < 0))
    ending_balance = initial_balance + deposits_total - withdrawals_total
    service_fee = 25 if ending_balance < 5000 else 0
    if service_fee:
        withdrawals_total += service_fee
        ending_balance -= service_fee
    
    transactions_count = len(df)
    
    deposits = [
        {"date": row["Date"], "description": row["Description"], "amount": f"${row['Amount']:,.2f}"}
        for _, row in df.iterrows() if row['Amount'] > 0
    ]
    withdrawals = [
        {"date": row["Date"], "description": row["Description"], "amount": f"${abs(row['Amount']):,.2f}"}
        for _, row in df.iterrows() if row['Amount'] < 0
    ]
    daily_balances = []
    balance_dict = {}
    for _, row in df.iterrows():
        date = row["Date"]
        balance_dict[date] = row["Balance"]  # Already a float
    for date in sorted(balance_dict.keys()):
        daily_balances.append({"date": date, "amount": balance_dict[date]})  # Use float directly
    
    address = fake.address().replace('\n', '<br>')
    
    # Embed logo as base64
    logo_path = os.path.join(SAMPLE_LOGOS_DIR, BANK_LOGO)
    logo_data = ""
    if os.path.exists(logo_path):
        with open(logo_path, "rb") as img_file:
            logo_data = f"data:image/png;base64,{base64.b64encode(img_file.read()).decode('utf-8')}"
    
    min_date = datetime.strptime(min(df['Date']), "%m/%d").replace(year=2025).strftime("%B %d, %Y")
    max_date = datetime.strptime(max(df['Date']), "%m/%d").replace(year=2025).strftime("%B %d, %Y")
    
    template_data = {
        "account_holder": account_holder,
        "account_holder_address": address,
        "account_number": fake.bban()[:15],
        "statement_period": f"{min_date} through {max_date}",
        "summary": {
            "beginning_balance": f"${initial_balance:,.2f}",
            "deposits_count": len(deposits),
            "deposits_total": f"${deposits_total:,.2f}",
            "withdrawals_count": len(withdrawals) + (1 if service_fee else 0),
            "withdrawals_total": f"${withdrawals_total:,.2f}",
            "ending_balance": f"${ending_balance:,.2f}",
            "transactions_count": transactions_count
        },
        "deposits": deposits,
        "withdrawals": withdrawals,
        "daily_balances": daily_balances,
        "logo_path": logo_data,
        "show_fee_waiver": service_fee == 0
    }
    
    results = []
    # Loop through all templates in the directory
    for template_file in os.listdir(template_dir):
        if template_file.endswith('.html'):
            template = env.get_template(template_file)
            html_filename = os.path.join(output_dir, f"bank_statement_{account_holder.replace(' ', '_')}_{os.path.splitext(template_file)[0]}.html")
            pdf_filename = os.path.join(output_dir, f"bank_statement_{account_holder.replace(' ', '_')}_{os.path.splitext(template_file)[0]}.pdf")
            
            rendered_html = template.render(**template_data)
            
            with open(html_filename, 'w') as f:
                f.write(rendered_html)
            
            wkhtmltopdf_path = "C:\\Program Files\\wkhtmltopdf\\bin\\wkhtmltopdf.exe"  # Adjust if needed
            config = pdfkit.configuration(wkhtmltopdf=wkhtmltopdf_path)
            options = {
                "enable-local-file-access": "",
                "page-size": "Letter",
                "margin-top": "0.75in",
                "margin-right": "0.75in",
                "margin-bottom": "0.75in",
                "margin-left": "0.75in",
                "encoding": "UTF-8",
                "disable-javascript": "",
                "image-dpi": "300"
            }
            try:
                pdfkit.from_string(rendered_html, pdf_filename, configuration=config, options=options)
            except OSError as e:
                print(f"PDF generation failed for {template_file}: {e}. Ensure wkhtmltopdf is installed and accessible.")
                continue
            
            results.append((html_filename, pdf_filename))
    
    return results

In [38]:
# Cell 9: Main execution
if __name__ == "__main__":
    num_transactions = random.randint(3,12)
    account_holder = fake.company().upper()
    template_dir = TEMPLATES_DIR
    output_dir = SYNTHETIC_STAT_DIR
    
    df = generate_bank_statement(num_transactions, account_holder)
    csv_filename = os.path.join(SYNTHETIC_STAT_DIR, f"bank_statement_{account_holder.replace(' ', '_')}_chase.csv")
    df.to_csv(csv_filename, index=False)
    print(f"CSV saved as: {csv_filename}")
    
    statement_fields = identify_template_fields(os.path.join(TEMPLATES_DIR, "chase_final.html"))  # Use one template for field identification
    print("Identified Template Fields:")
    for field in statement_fields.fields:
        print(f"- {field.name}: {'Mutable' if field.is_mutable else 'Immutable'}, {field.description}")
    
    results = generate_populated_html_and_pdf(df, account_holder, template_dir, output_dir)
    for html_file, pdf_file in results:
        print(f"HTML saved as: {html_file}")
        print(f"PDF saved as: {pdf_file}")

C:\Users\cha\AppData\Local\Temp\ipykernel_16996\477407964.py:24: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  return transaction.dict()


CSV saved as: synthetic_statements\bank_statement_HERNANDEZ-PEREZ_chase.csv
Ollama failed: Expecting value: line 1 column 1 (char 0). Using fallback fields.
Identified Template Fields:
- account_holder: Mutable, Name of the account holder
- account_holder_address: Mutable, Address of the account holder
- account_number: Mutable, Account number
- statement_period: Mutable, Statement date range
- summary: Mutable, Checking summary data (beginning balance, counts, totals)
- deposits: Mutable, List of deposit transactions
- withdrawals: Mutable, List of withdrawal transactions
- daily_balances: Mutable, Daily ending balances
- logo_path: Mutable, Path to the bank logo
- bank_name: Immutable, Name of the bank (JPMorgan Chase)
- bank_address: Immutable, Bank address
- checking_summary_header: Immutable, Header for checking summary
- deposits_header: Immutable, Header for deposits section
- withdrawals_header: Immutable, Header for withdrawals section
- daily_balance_header: Immutable, Header

In [39]:
# # Cell 10: Test Individual Template
# def test_individual_template(df: pd.DataFrame, account_holder: str, template_dir: str, template_name: str, output_dir: str) -> tuple[str, str]:
#     import os
#     from datetime import datetime
    
#     # Initialize environment for templates
#     env = Environment(loader=FileSystemLoader(template_dir))
    
#     # Generate initial data (reuse existing logic)
#     initial_balance = round(random.uniform(1000, 20000), 2)
#     deposits_total = sum(x for x in df['Amount'] if x > 0)
#     withdrawals_total = abs(sum(x for x in df['Amount'] if x < 0))
#     ending_balance = initial_balance + deposits_total - withdrawals_total
#     service_fee = 25 if ending_balance < 5000 else 0
#     if service_fee:
#         withdrawals_total += service_fee
#         ending_balance -= service_fee
    
#     transactions_count = len(df)
    
#     deposits = [
#         {"date": row["Date"], "description": row["Description"], "amount": f"${row['Amount']:,.2f}"}
#         for _, row in df.iterrows() if row['Amount'] > 0
#     ]
#     withdrawals = [
#         {"date": row["Date"], "description": row["Description"], "amount": f"${abs(row['Amount']):,.2f}"}
#         for _, row in df.iterrows() if row['Amount'] < 0
#     ]
#     daily_balances = []
#     balance_dict = {}
#     for _, row in df.iterrows():
#         date = row["Date"]
#         balance_dict[date] = row["Balance"]
#     for date in sorted(balance_dict.keys()):
#         daily_balances.append({"date": date, "amount": balance_dict[date]})
    
#     address = fake.address().replace('\n', '<br>')
    
#     # Embed logo as base64
#     logo_path = os.path.join(SAMPLE_LOGOS_DIR, BANK_LOGO)
#     logo_data = ""
#     if os.path.exists(logo_path):
#         with open(logo_path, "rb") as img_file:
#             logo_data = f"data:image/png;base64,{base64.b64encode(img_file.read()).decode('utf-8')}"
    
#     min_date = datetime.strptime(min(df['Date']), "%m/%d").replace(year=2025).strftime("%B %d, %Y")
#     max_date = datetime.strptime(max(df['Date']), "%m/%d").replace(year=2025).strftime("%B %d, %Y")
    
#     template_data = {
#         "account_holder": account_holder,
#         "account_holder_address": address,
#         "account_number": fake.bban()[:15],
#         "statement_period": f"{min_date} through {max_date}",
#         "summary": {
#             "beginning_balance": f"${initial_balance:,.2f}",
#             "deposits_count": len(deposits),
#             "deposits_total": f"${deposits_total:,.2f}",
#             "withdrawals_count": len(withdrawals) + (1 if service_fee else 0),
#             "withdrawals_total": f"${withdrawals_total:,.2f}",
#             "ending_balance": f"${ending_balance:,.2f}",
#             "transactions_count": transactions_count
#         },
#         "deposits": deposits,
#         "withdrawals": withdrawals,
#         "daily_balances": daily_balances,
#         "logo_path": logo_data,
#         "show_fee_waiver": service_fee == 0
#     }
    
#     # Load and render the specific template
#     if not os.path.exists(os.path.join(template_dir, template_name)):
#         raise FileNotFoundError(f"Template {template_name} not found in {template_dir}")
    
#     template = env.get_template(template_name)
#     html_filename = os.path.join(output_dir, f"test_bank_statement_{account_holder.replace(' ', '_')}_{os.path.splitext(template_name)[0]}.html")
#     pdf_filename = os.path.join(output_dir, f"test_bank_statement_{account_holder.replace(' ', '_')}_{os.path.splitext(template_name)[0]}.pdf")
    
#     rendered_html = template.render(**template_data)
    
#     with open(html_filename, 'w') as f:
#         f.write(rendered_html)
    
#     wkhtmltopdf_path = "C:\\Program Files\\wkhtmltopdf\\bin\\wkhtmltopdf.exe"  # Adjust if needed
#     config = pdfkit.configuration(wkhtmltopdf=wkhtmltopdf_path)
#     options = {
#         "enable-local-file-access": "",
#         "page-size": "Letter",
#         "margin-top": "0.75in",
#         "margin-right": "0.75in",
#         "margin-bottom": "0.75in",
#         "margin-left": "0.75in",
#         "encoding": "UTF-8",
#         "disable-javascript": "",
#         "image-dpi": "300",
#         "enable-forms": "",
#         "no-outline": ""
#     }
#     try:
#         pdfkit.from_string(rendered_html, pdf_filename, configuration=config, options=options)
#     except OSError as e:
#         print(f"PDF generation failed for {template_name}: {e}. Ensure wkhtmltopdf is installed and accessible.")
#         raise
    
#     return html_filename, pdf_filename

# # Example usage
# if __name__ == "__main__":
#     num_transactions = 12
#     account_holder = fake.company().upper()
#     template_dir = TEMPLATES_DIR
#     output_dir = SYNTHETIC_STAT_DIR
    
#     df = generate_bank_statement(num_transactions, account_holder)
#     csv_filename = os.path.join(SYNTHETIC_STAT_DIR, f"bank_statement_{account_holder.replace(' ', '_')}_chase.csv")
#     df.to_csv(csv_filename, index=False)
#     print(f"CSV saved as: {csv_filename}")
    
#     statement_fields = identify_template_fields(os.path.join(TEMPLATES_DIR, "chase_final.html"))
#     print("Identified Template Fields:")
#     for field in statement_fields.fields:
#         print(f"- {field.name}: {'Mutable' if field.is_mutable else 'Immutable'}, {field.description}")
    
#     # Test a specific template (e.g., 'web_style.html')
#     template_to_test = "chase_web_style_chat.html"  # Change this to any template name in TEMPLATES_DIR
#     html_file, pdf_file = test_individual_template(df, account_holder, template_dir, template_to_test, output_dir)
#     print(f"Test HTML saved as: {html_file}")
#     print(f"Test PDF saved as: {pdf_file}")