In [25]:
# Cell 1: Import libraries for data handling, random data, and PDF creation
import pandas as pd
import numpy as np
from faker import Faker
import random
from datetime import datetime, timedelta
import json
import ollama
from pydantic import BaseModel, Field
from typing import List
from reportlab.lib import colors
from reportlab.lib.pagesizes import letter
from reportlab.platypus import SimpleDocTemplate, Table, TableStyle, Paragraph, Spacer
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.lib.units import inch
import os

# Initialize Faker for fake data
fake = Faker()


In [26]:
# Cell 2: Create directory


In [27]:

# Cell 3: Pydantic structured output for a transaction
class Transaction(BaseModel):
    description: str = Field(..., max_length=35, description="Transaction description, max 35 characters")
    category: str
    amount: float


In [28]:
# Cell 4: Generate category lists using LLM
def generate_category_lists() -> tuple[List[str], List[str]]:
    prompt = """
    Generate two lists of bank transaction categories in JSON format. One list for reasons someone loses money (e.g., utilities, subscriptions) and one for reasons someone gains money (e.g., gifts, deposits). Each list should have 5-7 unique categories, each 1-2 words, title case, no punctuation. Return as:
    {
      "loss_categories": ["Category One", "Category Two", ...],
      "gain_categories": ["Category One", "Category Two", ...]
    }
    """
    try:
        response = ollama.generate(model="mistral:7b-instruct-v0.3-q4_0", prompt=prompt)
        category_data = json.loads(response['response'].strip())
        loss_categories = category_data.get("loss_categories", [])
        gain_categories = category_data.get("gain_categories", [])
        # Validate categories
        loss_categories = [cat for cat in loss_categories if isinstance(cat, str) and 1 <= len(cat.split()) <= 2]
        gain_categories = [cat for cat in gain_categories if isinstance(cat, str) and 1 <= len(cat.split()) <= 2]
        if len(loss_categories) < 5 or len(gain_categories) < 5:
            raise ValueError("Insufficient valid categories")
    except (json.JSONDecodeError, ValueError):
        # Fallback categories if LLM fails or returns invalid data
        loss_categories = ["Utility Payment", "Subscription Fee", "Online Purchase", "Rent Payment", "Grocery Shopping", "Insurance Bill"]
        gain_categories = ["Salary Deposit", "Tax Refund", "Gift Received", "Client Payment", "Investment Return", "Cash Deposit"]
    return loss_categories, gain_categories

# Cell 5: Generate transaction description using LLM
def generate_transaction_description(amount: float, category: str) -> dict:
    prompt = f"""
    Generate a bank transaction description (3-5 words, max 25 characters) for a transaction in the '{category}' category.
    Rules:
    - Use title case (e.g., 'Grocery Store Purchase').
    - No punctuation (commas, periods, etc.).
    - No parentheses, dashes, or dollar signs.
    - No amounts or numbers as words.
    - No typos.
    - Use simple, clear phrases.
    - Examples: 'Grocery Store Purchase', 'Utility Bill Payment', 'Salary Direct Deposit'
    """
    try:
        response = ollama.generate(model="mistral:7b-instruct-v0.3-q4_0", prompt=prompt)
        description = response['response'].strip()[:25]
    except:
        description = f"{category} Transaction"  # Fallback if LLM fails

    # Clean description
    description = description.replace("(", "").replace(")", "").replace(",", "").replace(":", "").replace("-", "").replace("$", "").replace(".", "")
    description = ' '.join(word.capitalize() for word in description.split())[:25]

    # Validate word count
    words = description.split()
    if len(words) < 3 or len(words) > 5:
        description = f"{category} Transaction"[:25]  # Fallback to generic description

    transaction = Transaction(
        description=description,
        category=category,
        amount=amount
    )
    return transaction.dict()

In [29]:

# Cell 6: Generate the bank statement
def generate_bank_statement(num_transactions: int = 10, account_holder: str = "John Doe") -> pd.DataFrame:
    # Get category lists
    loss_categories, gain_categories = generate_category_lists()

    # Generate random dates within the last 90 days
    start_date = datetime.now() - timedelta(days=90)
    dates = [start_date + timedelta(days=random.randint(0, 90)) for _ in range(num_transactions)]

    # Create transactions
    transactions = []
    for _ in range(num_transactions):
        is_gain = random.choice([True, False])
        category = random.choice(gain_categories if is_gain else loss_categories)
        amount = round(random.uniform(50, 1000), 2) if is_gain else round(random.uniform(-500, -10), 2)
        transaction = generate_transaction_description(amount, category)
        transactions.append(transaction)

    # Create DataFrame
    data = {
        "Date": [d.strftime("%Y-%m-%d") for d in dates],
        "Description": [t["description"] for t in transactions],
        "Category": [t["category"] for t in transactions],
        "Amount": [t["amount"] for t in transactions],
        "Balance": [0.0] * num_transactions,
        "Account Holder": [account_holder] * num_transactions,
        "Transaction ID": [fake.bban() for _ in range(num_transactions)]
    }
    df = pd.DataFrame(data)

    # Sort by date and calculate running balance
    df = df.sort_values("Date")
    initial_balance = 1000.0
    df["Balance"] = initial_balance + df["Amount"].cumsum()

    return df


In [30]:

# Cell 7: Create PDF from the data
def generate_pdf(df: pd.DataFrame, account_holder: str, output_filename: str) -> str:
    doc = SimpleDocTemplate(
        output_filename,
        pagesize=letter,
        rightMargin=0.5*inch,
        leftMargin=0.5*inch,
        topMargin=0.75*inch,
        bottomMargin=0.5*inch
    )
    elements = []

    styles = getSampleStyleSheet()
    title = Paragraph(f"Bank Statement for {account_holder}", styles['Title'])
    elements.append(title)
    elements.append(Spacer(1, 0.25*inch))

    cell_style = styles['Normal']
    cell_style.fontSize = 7
    cell_style.leading = 9
    cell_style.wordWrap = 'CJK'

    # Format Amount and Balance
    df = df.copy()
    df['Amount'] = df['Amount'].apply(lambda x: f"{x:.2f}")
    df['Balance'] = df['Balance'].apply(lambda x: f"{x:.2f}")

    # Convert DataFrame to table data
    data = [df.columns.tolist()]
    for _, row in df.iterrows():
        row_data = [Paragraph(str(cell), cell_style) for cell in row]
        data.append(row_data)

    # Adjust column widths to accommodate longer descriptions
    page_width = letter[0] - (doc.leftMargin + doc.rightMargin)  # 540 points
    col_widths = [
        0.12 * page_width,  # Date
        0.3 * page_width,  # Description (increased from 0.30)
        0.14 * page_width,  # Category
        0.10 * page_width,  # Amount
        0.10 * page_width,  # Balance
        0.13 * page_width,  # Account Holder (reduced from 0.13)
        0.13 * page_width   # Transaction ID (reduced from 0.13)
    ]

    table = Table(data, colWidths=col_widths, splitByRow=True)
    table.setStyle(TableStyle([
        ('BACKGROUND', (0, 0), (-1, 0), colors.grey),
        ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
        ('ALIGN', (0, 0), (-1, 0), 'CENTER'),
        ('ALIGN', (1, 1), (1, -1), 'LEFT'),
        ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
        ('FONTSIZE', (0, 0), (-1, 0), 8),
        ('BOTTOMPADDING', (0, 0), (-1, 0), 6),
        ('TOPPADDING', (0, 1), (-1, -1), 4),
        ('BACKGROUND', (0, 1), (-1, -1), colors.beige),
        ('GRID', (0, 0), (-1, -1), 0.25, colors.black),
        ('FONTSIZE', (0, 1), (-1, -1), 7),
        ('VALIGN', (0, 0), (-1, -1), 'MIDDLE'),
    ]))

    elements.append(table)
    doc.build(elements)
    return output_filename


In [31]:

# Cell 8: Generate and Save Statement
num_transactions = 20
account_holder = "John Doe"

statement = generate_bank_statement(num_transactions, account_holder)
print("Bank Statement Preview:")
print(statement.head(5))

csv_filename = f"restructured_statements/bank_statement_{account_holder.replace(' ', '_')}.csv"
statement.to_csv(csv_filename, index=False)
print(f"CSV saved as: {csv_filename}")

pdf_filename = f"restructured_statements/bank_statement_{account_holder.replace(' ', '_')}.pdf"
generate_pdf(statement, account_holder, pdf_filename)
print(f"PDF saved as: {pdf_filename}")

C:\Users\cha\AppData\Local\Temp\ipykernel_4660\916404054.py:59: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  return transaction.dict()


Bank Statement Preview:
          Date               Description          Category  Amount  Balance  \
19  2025-03-08  Online Clothing Shopping   Online Purchase -350.72   649.28   
11  2025-03-10      Gift Received [name]     Gift Received  742.09  1391.37   
15  2025-03-13     Salary Direct Deposit    Salary Deposit  984.39  2375.76   
2   2025-03-18         Tax Refund Issued        Tax Refund  459.89  2835.65   
17  2025-03-20  Grocery Shopping Expense  Grocery Shopping -317.02  2518.63   

   Account Holder      Transaction ID  
19       John Doe  IXXE95488709265275  
11       John Doe  TLPZ00400138047858  
15       John Doe  QXRT55290243477530  
2        John Doe  LGCT04863707850567  
17       John Doe  YBPF60784118544448  
CSV saved as: restructured_statements/bank_statement_John_Doe.csv
PDF saved as: restructured_statements/bank_statement_John_Doe.pdf
