Generating Synthetic Data (Bank Statements)

In [13]:
# Cell 1: Import libraries
import pandas as pd
import numpy as np
from faker import Faker
import random
from datetime import datetime, timedelta
import json
import ollama

fake = Faker()


In [10]:
# Cell 2: Function to generate descriptions with Mistral
def generate_transaction_description(amount):
    if amount > 0:
        prompt = f"Generate a realistic bank transaction description for a deposit of ${amount:.2f}. Examples: 'Salary from Acme Corp', 'Freelance payment', 'Refund from Store'."
    else:
        prompt = f"Generate a realistic bank transaction description for a withdrawal or purchase of ${-amount:.2f}. Examples: 'Grocery purchase at Whole Foods', 'Utility bill payment', 'ATM withdrawal'."
    response = ollama.generate(model="mistral:7b-instruct-v0.3-q4_0", prompt=prompt)
    return response['response'].strip()


In [11]:
# Cell 3: Updated bank statement generation function
def generate_bank_statement(num_transactions=50, account_holder=None, transaction_types=None):
    account_holder = account_holder or fake.name()
    transaction_types = transaction_types or ["Purchase", "Deposit", "Withdrawal"]

    # Generate dates
    start_date = datetime.now() - timedelta(days=90)
    dates = [start_date + timedelta(days=random.randint(0, 90)) for _ in range(num_transactions)]

    # Generate descriptions with Mistral (batched)
    prompt = f"""
    Generate {num_transactions} realistic bank transaction descriptions in JSON format.
    Each should have 'description', 'category', and 'amount'.
    Bias toward categories: {', '.join(transaction_types)}.
    Example: [
        {{"description": "Grocery purchase at Whole Foods", "category": "Groceries", "amount": -45.67}},
        {{"description": "Salary from Acme Corp", "category": "Deposit", "amount": 2000.00}}
    ]
    """
    response = ollama.generate(model="mistral:7b-instruct-v0.3-q4_0", prompt=prompt)
    transactions = json.loads(response['response'])

    # Create DataFrame
    data = {
        "Date": [d.strftime("%Y-%m-%d") for d in dates],
        "Description": [t["description"] for t in transactions[:num_transactions]],
        "Category": [t["category"] for t in transactions[:num_transactions]],
        "Amount": [t["amount"] for t in transactions[:num_transactions]],
        "Balance": [0] * num_transactions,
        "Account Holder": [account_holder] * num_transactions,
        "Account Number": [fake.bban() for _ in range(num_transactions)]
    }
    df = pd.DataFrame(data)
    
    # Sort and calculate balance
    df = df.sort_values("Date")
    initial_balance = random.uniform(1000, 5000)
    df["Balance"] = initial_balance + df["Amount"].cumsum()
    
    return df

In [14]:
# Cell 4: Generate
statement = generate_bank_statement(num_transactions=10, account_holder="John Doe")

In [16]:
# Cell 4: Preview

statement.sample(5)

Unnamed: 0,Date,Description,Category,Amount,Balance,Account Holder,Account Number
6,2025-04-27,Monthly subscription fee for Netflix,Entertainment,-13.99,3078.588475,John Doe,MCBR24936267826612
1,2025-06-02,Direct deposit of salary from TechnoCorp,Deposit,3500.0,6958.088475,John Doe,CDFG81575867947734
5,2025-04-23,Withdrawal from ATM for cash needs,Withdrawal,-200.0,3342.578475,John Doe,YUWI48312532956551
3,2025-03-19,Rent payment for the apartment,Housing,-1000.0,3754.918475,John Doe,MMUV90243219863151
8,2025-04-23,Loan repayment to Bank of America,Loan Repayment,-250.0,3092.578475,John Doe,BPFO41281248666551
