In [9]:
# Cell 1: Import libraries for data handling, random data, and PDF creation
import pandas as pd
import numpy as np
from faker import Faker
import random
from datetime import datetime, timedelta
import json
import ollama
from pydantic import BaseModel, Field
from typing import List
from reportlab.lib import colors
from reportlab.lib.pagesizes import letter
from reportlab.platypus import SimpleDocTemplate, Table, TableStyle, Paragraph, Spacer, Image
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.lib.units import inch
import os

# Initialize Faker for fake data
fake = Faker()

# Cell 2: Create directory
import os

# Define the base directory and output directory
base_dir = r"C:\Users\cha\Desktop\SynthData\version3"
output_dir = os.path.join(base_dir, "synthetic_statements")

# Create the Synthetic_Statements directory if it doesn't exist
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Cell 3: Pydantic structured output for a transaction
class Transaction(BaseModel):
    description: str = Field(..., max_length=35, description="Transaction description, max 35 characters")
    category: str
    amount: float

# Cell 4: Generate category lists
def generate_category_lists() -> tuple[List[str], List[str]]:
    loss_categories = ["Utility Payment", "Subscription Fee", "Online Purchase", "Rent Payment", "Grocery Shopping", "Insurance Bill", "Loan Repayment"]
    gain_categories = ["Salary Deposit", "Tax Refund", "Gift Received", "Client Payment", "Investment Return", "Cash Deposit", "Item Refund"]
    return loss_categories, gain_categories

# Cell 5: Generate transaction description
def generate_transaction_description(amount: float, category: str) -> dict:
    description_dict = {
        "Utility Payment": "Utility Bill Payment",
        "Subscription Fee": "Monthly Subscription Fee",
        "Online Purchase": "Online Store Purchase",
        "Rent Payment": "Monthly Rent Payment",
        "Grocery Shopping": "Grocery Store Purchase",
        "Insurance Bill": "Insurance Premium Bill",
        "Loan Repayment": "Loan Repayment Installment",
        "Salary Deposit": "Payroll Direct Deposit",
        "Tax Refund": "Tax Refund Deposit",
        "Gift Received": "Gift Received Deposit",
        "Client Payment": "Client Invoice Payment",
        "Investment Return": "Investment Income Received",
        "Cash Deposit": "Cash Deposit Transaction",
        "Item Refund": "Refund Item Returned"
    }
    
    description = description_dict.get(category, f"{category} Transaction")[:35]
    
    description = description.replace("(", "").replace(")", "").replace(",", "").replace(":", "").replace("-", "").replace("$", "").replace(".", "")
    description = ' '.join(word.capitalize() for word in description.split())[:35]
    
    words = description.split()
    if len(words) < 3 or len(words) > 5:
        description = f"{category} Transaction"[:35]
    
    transaction = Transaction(
        description=description,
        category=category,
        amount=amount
    )
    return transaction.dict()

# Cell 6: Generate the bank statement
def generate_bank_statement(num_transactions: int = 12, account_holder: str = "John Doe") -> pd.DataFrame:
    loss_categories, gain_categories = generate_category_lists()
    start_date = datetime.now() - timedelta(days=90)
    dates = [start_date + timedelta(days=random.randint(0, 90)) for _ in range(num_transactions)]
    transactions = []
    for _ in range(num_transactions):
        is_gain = random.choice([True, False])
        category = random.choice(gain_categories if is_gain else loss_categories)
        amount = round(random.uniform(50, 1000), 2) if is_gain else round(random.uniform(-500, -10), 2)
        transaction = generate_transaction_description(amount, category)
        transactions.append(transaction)
    
    data = {
        "Date": [d.strftime("%Y-%m-%d") for d in dates],
        "Description": [t["description"] for t in transactions],
        "Category": [t["category"] for t in transactions],
        "Amount": [t["amount"] for t in transactions],
        "Balance": [0.0] * num_transactions,
        "Account Holder": [account_holder] * num_transactions,
        "Transaction ID": [fake.bban() for _ in range(num_transactions)]
    }
    df = pd.DataFrame(data)
    df = df.sort_values("Date")
    initial_balance = 1000.0
    df["Balance"] = initial_balance + df["Amount"].cumsum()
    return df



In [12]:
from reportlab.lib import colors
from reportlab.lib.pagesizes import letter
from reportlab.platypus import SimpleDocTemplate, Table, TableStyle, Paragraph, Spacer, Image
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.lib.units import inch
from datetime import datetime
import os

# Define logo path
base_dir = r"C:\Users\cha\Desktop\SynthData\version3"
chase_logo_path = os.path.join(base_dir, "Sample_Logos", "Chase_Bank_Logo.png")

def generate_pdf_chase(df: pd.DataFrame, account_holder: str, output_filename: str) -> str:
    doc = SimpleDocTemplate(
        output_filename,
        pagesize=letter,
        rightMargin=0.5*inch,
        leftMargin=0.5*inch,
        topMargin=0.75*inch,
        bottomMargin=0.5*inch
    )
    elements = []

    styles = getSampleStyleSheet()
    
    # Custom styles with adjusted font sizes to match Chase example
    title_style = styles['Heading1']
    title_style.fontSize = 12
    title_style.fontName = 'Helvetica-Bold'
    
    detail_style = styles['Normal']
    detail_style.fontSize = 8
    detail_style.leading = 10
    
    contact_style = styles['Normal']
    contact_style.fontSize = 8
    contact_style.alignment = 2  # Right alignment
    contact_style.leading = 10
    
    heading_style = styles['Heading2']
    heading_style.fontName = 'Helvetica-Bold'
    heading_style.fontSize = 10
    heading_style.leftIndent = 0.25*inch  # Indentation for section headers

    # Header Section with Logo
    if os.path.exists(chase_logo_path):
        logo = Image(chase_logo_path, width=1.5*inch, height=0.5*inch)  # Match aspect ratio 1:3
    else:
        print(f"Logo not found at: {chase_logo_path}")
        logo = Paragraph("Chase Logo Not Found", detail_style)
    
    title = Paragraph("JPMorgan Chase Bank, N.A.", title_style)
    address = Paragraph("P.O. Box 659754, San Antonio, TX 78265-9754", detail_style)
    account_holder_info = Paragraph(f"{account_holder}<br/>2124 N Carroll Ave Suite 9<br/>Dallas, TX, 75204", detail_style)
    statement_period = Paragraph(f"{min(df['Date']).replace('-', '/')} through {max(df['Date']).replace('-', '/')}", detail_style)
    account_number = Paragraph(f"Account Number: {fake.bban()[:10]}", detail_style)

    contact_info = Paragraph(
        "CUSTOMER SERVICE INFORMATION<br/>"
        "Web site: chase.com<br/>"
        "Service Center: 1-800-242-7338<br/>"
        "Hearing Impaired: 1-800-242-7383<br/>"
        "Para Espanol: 1-888-622-4273<br/>"
        "International Calls: 1-713-262-1679",
        contact_style
    )
    contact_info_table = Table([[contact_info]], colWidths=[2.5*inch])
    contact_info_table.setStyle(TableStyle([
        ('BOX', (0, 0), (-1, -1), 1, colors.black),
        ('ALIGN', (0, 0), (-1, -1), 'RIGHT'),
        ('TOPPADDING', (0, 0), (-1, -1), 5),
        ('BOTTOMPADDING', (0, 0), (-1, -1), 5),
    ]))
    
    # Header layout with logo in top left
    header_data = [
        [logo, ""],
        [title, statement_period],
        [address, account_number],
        [account_holder_info, contact_info_table]
    ]
    header_table = Table(header_data, colWidths=[3*inch, 3*inch], rowHeights=[0.5*inch, 0.2*inch, 0.2*inch, 0.4*inch])
    header_table.setStyle(TableStyle([
        ('VALIGN', (0, 0), (-1, -1), 'TOP'),
        ('ALIGN', (1, 0), (1, -1), 'RIGHT'),
        ('ALIGN', (0, 0), (0, 0), 'LEFT'),
        ('TOPPADDING', (0, 0), (-1, -1), 5),
        ('BOTTOMPADDING', (0, 0), (-1, -1), 5),
        ('LEFTPADDING', (0, 0), (0, -1), 0),
        ('LEFTPADDING', (1, 0), (1, -1), 0.25*inch),
    ]))
    
    # Checking Summary
    initial_balance = 1000.0  # Match OCR starting balance
    deposits_total = sum(x for x in df['Amount'] if x > 0)
    withdrawals_total = abs(sum(x for x in df['Amount'] if x < 0))
    ending_balance = initial_balance + deposits_total - withdrawals_total
    
    summary_data = [
        ["", "INSTANCES", "AMOUNT"],
        ["Beginning Balance", 1, f"${initial_balance:,.2f}"],
        ["Deposits and Additions", len([x for x in df['Amount'] if x > 0]), f"${deposits_total:,.2f}"],
        ["Electronic Withdrawals", len([x for x in df['Amount'] if x < 0]), f"${withdrawals_total:,.2f}"],
        ["Ending Balance", 1, f"${ending_balance:,.2f}"]
    ]
    summary_table = Table(summary_data, colWidths=[2.5*inch, 0.75*inch, 1.25*inch])
    summary_table.setStyle(TableStyle([
        ('GRID', (0, 0), (-1, -1), 0.25, colors.black),
        ('ALIGN', (0, 0), (-1, -1), 'LEFT'),
        ('ALIGN', (1, 1), (1, -1), 'CENTER'),
        ('ALIGN', (2, 1), (2, -1), 'RIGHT'),
        ('FONTSIZE', (0, 0), (-1, -1), 8),
        ('BACKGROUND', (0, 0), (-1, 0), colors.lightgrey),
        ('TEXTCOLOR', (0, 0), (-1, 0), colors.black),
        ('LEFTPADDING', (0, 0), (-1, -1), 0.25*inch),
    ]))
    
    waiver_note = Paragraph(
        "Your monthly service fee was waived because you maintained an average checking balance of $7,500.00 or a minimum checking balance of $5,000.00 or more during the statement period.",
        detail_style
    )

    # Deposits and Additions
    deposits_data = [["DATE", "DESCRIPTION", "AMOUNT"]]
    for _, row in df.iterrows():
        if row['Amount'] > 0:
            deposits_data.append([row['Date'].replace('-', '/'), row['Description'], f"${row['Amount']:,.2f}"])
    deposits_table = Table(deposits_data, colWidths=[0.75*inch, 2.5*inch, 1*inch])
    deposits_table.setStyle(TableStyle([
        ('GRID', (0, 0), (-1, -1), 0.25, colors.black),
        ('ALIGN', (0, 0), (-1, -1), 'LEFT'),
        ('ALIGN', (2, 1), (2, -1), 'RIGHT'),
        ('FONTSIZE', (0, 0), (-1, -1), 8),
        ('BACKGROUND', (0, 0), (-1, 0), colors.lightgrey),
        ('TEXTCOLOR', (0, 0), (-1, 0), colors.black),
        ('LEFTPADDING', (0, 0), (-1, -1), 0.25*inch),
    ]))

    # Withdrawals
    withdrawals_data = [["DATE", "DESCRIPTION", "AMOUNT"]]
    for _, row in df.iterrows():
        if row['Amount'] < 0:
            withdrawals_data.append([row['Date'].replace('-', '/'), row['Description'], f"${abs(row['Amount']):,.2f}"])
    withdrawals_table = Table(withdrawals_data, colWidths=[0.75*inch, 2.5*inch, 1*inch])
    withdrawals_table.setStyle(TableStyle([
        ('GRID', (0, 0), (-1, -1), 0.25, colors.black),
        ('ALIGN', (0, 0), (-1, -1), 'LEFT'),
        ('ALIGN', (2, 1), (2, -1), 'RIGHT'),
        ('FONTSIZE', (0, 0), (-1, -1), 8),
        ('BACKGROUND', (0, 0), (-1, 0), colors.lightgrey),
        ('TEXTCOLOR', (0, 0), (-1, 0), colors.black),
        ('LEFTPADDING', (0, 0), (-1, -1), 0.25*inch),
    ]))

    # Daily Ending Balance
    daily_balance_data = [["DATE", "AMOUNT"]]
    for _, row in df.iterrows():
        daily_balance_data.append([row['Date'].replace('-', '/'), f"${row['Balance']:,.2f}"])
    daily_balance_table = Table(daily_balance_data, colWidths=[0.75*inch, 1.25*inch], repeatRows=1)  # repeatRows for pagination
    daily_balance_table.setStyle(TableStyle([
        ('GRID', (0, 0), (-1, -1), 0.25, colors.black),
        ('ALIGN', (0, 0), (-1, -1), 'LEFT'),
        ('ALIGN', (1, 1), (1, -1), 'RIGHT'),
        ('FONTSIZE', (0, 0), (-1, -1), 8),
        ('BACKGROUND', (0, 0), (-1, 0), colors.lightgrey),
        ('TEXTCOLOR', (0, 0), (-1, 0), colors.black),
        ('LEFTPADDING', (0, 0), (-1, -1), 0.25*inch),
    ]))

    # Assemble elements with adjusted indentation
    elements.append(header_table)
    elements.append(Spacer(1, 0.2*inch))
    
    elements.append(Paragraph("CHECKING SUMMARY", heading_style))
    elements.append(Paragraph("Chase Business Select Checking", detail_style))
    elements.append(Spacer(1, 0.1*inch))
    elements.append(summary_table)
    elements.append(Spacer(1, 0.1*inch))
    elements.append(waiver_note)
    elements.append(Spacer(1, 0.2*inch))
    
    elements.append(Paragraph("DEPOSITS AND ADDITIONS", heading_style))
    elements.append(Spacer(1, 0.1*inch))
    elements.append(deposits_table)
    elements.append(Spacer(1, 0.2*inch))
    
    elements.append(Paragraph("WITHDRAWALS", heading_style))
    elements.append(Spacer(1, 0.1*inch))
    elements.append(withdrawals_table)
    elements.append(Spacer(1, 0.2*inch))
    
    elements.append(Paragraph("DAILY ENDING BALANCE", heading_style))
    elements.append(Spacer(1, 0.1*inch))
    elements.append(daily_balance_table)

    doc.build(elements)
    return output_filename

In [13]:
num_transactions = 12
account_holder = "John Doe"

statement = generate_bank_statement(num_transactions, account_holder)
print("Bank Statement Preview:")
print(statement.head(5))

csv_filename = os.path.join(output_dir, f"bank_statement_{account_holder.replace(' ', '_')}.csv")
statement.to_csv(csv_filename, index=False)
print(f"CSV saved as: {csv_filename}")

pdf_filename_chase = os.path.join(output_dir, f"bank_statement_{account_holder.replace(' ', '_')}_chase.pdf")
generate_pdf_chase(statement, account_holder, pdf_filename_chase)
print(f"Chase PDF saved as: {pdf_filename_chase}")

Bank Statement Preview:
          Date                 Description        Category  Amount  Balance  \
2   2025-03-15          Tax Refund Deposit      Tax Refund  539.74  1539.74   
1   2025-03-16      Client Invoice Payment  Client Payment  339.64  1879.38   
5   2025-03-23    Cash Deposit Transaction    Cash Deposit  738.09  2617.47   
7   2025-03-23  Loan Repayment Installment  Loan Repayment -497.83  2119.64   
10  2025-04-07        Monthly Rent Payment    Rent Payment -477.88  1641.76   

   Account Holder      Transaction ID  
2        John Doe  WVRZ02611609408439  
1        John Doe  IPDT99090639989427  
5        John Doe  OAIH95192059144614  
7        John Doe  WGDX27579464252503  
10       John Doe  JDAO74135536750980  
CSV saved as: C:\Users\cha\Desktop\SynthData\version3\synthetic_statements\bank_statement_John_Doe.csv
Chase PDF saved as: C:\Users\cha\Desktop\SynthData\version3\synthetic_statements\bank_statement_John_Doe_chase.pdf


C:\Users\cha\AppData\Local\Temp\ipykernel_13264\4271723474.py:77: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  return transaction.dict()
