In [42]:
import pandas as pd
import numpy as np
from faker import Faker
import random
from datetime import datetime, timedelta
import os
import csv

# Initialize Faker
fake = Faker()

# Set random seed for reproducibility
random.seed(42)
np.random.seed(42)


In [48]:
# Parameters based on Project Aurora
num_individuals = 800  # Number of individual accounts
num_businesses = 200   # Number of business accounts
num_transactions = 5000  # Total transactions
transaction_types = ['cash', 'wire', 'crypto', 'international']

# Curated list of real countries
countries = [
    'United States', 'Germany', 'Brazil', 'Japan', 'India', 'France', 'Canada',
    'Australia', 'South Africa', 'Mexico', 'United Kingdom', 'China', 'Italy', 'Spain'
]

# Generate entities (individuals and businesses)
individuals = [
    {
        'entity_id': f'I{str(i).zfill(4)}',
        'entity_type': 'individual',
        'name': fake.last_name(),
        'country': random.choice(countries)
    }
    for i in range(num_individuals)
]

business_names = [
    'Acme', 'Nexlify', 'Techtron', 'Globex', 'Innovex', 'Quantum', 'Synergy',
    'Vertex', 'OmniCorp', 'BrightPeak', 'Nexus', 'Strive', 'Pinnacle', 'CoreZap'
] * (num_businesses // 14 + 1)

businesses = [
    {
        'entity_id': f'B{str(i).zfill(4)}',
        'entity_type': 'business',
        'name': business_names[i],
        'country': random.choice(countries)
    }
    for i in range(num_businesses)
]

entities = individuals + businesses
entities_df = pd.DataFrame(entities)

# Validate entities_df
for idx, row in entities_df.iterrows():
    if len(row) != 4:
        print(f"Warning: Row {idx} in entities_df has {len(row)} columns: {row}")
    if any(c in row['name'] for c in [',', '"', ' ']):
        print(f"Warning: Row {idx} in entities_df has invalid characters in name: {row['name']}")

# Generate transactions with money laundering patterns
def generate_transaction(entities_df, idx, is_ml=False):
    valid_entity_ids = entities_df['entity_id'].values
    sender = np.random.choice(valid_entity_ids)
    
    receiver_candidates = entities_df[entities_df['entity_id'] != sender]['entity_id'].values
    if len(receiver_candidates) == 0:
        print(f"Warning: No valid receivers for sender {sender}")
        return None
    
    receiver = np.random.choice(receiver_candidates)
    
    if sender.startswith('T') or receiver.startswith('T'):
        print(f"Error: Invalid ID generated - sender: {sender}, receiver: {receiver}")
        return None
    
    if is_ml:
        amount = np.random.uniform(50, 900)  # Smurfing: small transactions
    else:
        amount = np.random.uniform(100, 10000)  # Normal transactions
    
    transaction_type = random.choice(transaction_types)
    timestamp = fake.date_time_between(start_date='-30d', end_date='now')
    
    sender_country = entities_df[entities_df['entity_id'] == sender]['country'].iloc[0]
    receiver_country = entities_df[entities_df['entity_id'] == receiver]['country'].iloc[0]
    is_cross_border = sender_country != receiver_country
    
    ml_flag = 1 if is_ml else 0
    
    return {
        'sender_id': sender,
        'receiver_id': receiver,
        'transaction_id': f'T{str(idx).zfill(5)}',
        'amount': round(amount, 2),
        'transaction_type': transaction_type,
        'timestamp': timestamp,
        'is_cross_border': is_cross_border,
        'ml_flag': ml_flag
    }

# Generate transactions
transactions = []
ml_transaction_count = int(num_transactions * 0.1)
i = 0
while len(transactions) < num_transactions:
    is_ml = i < ml_transaction_count
    if is_ml and i % 10 == 0:  # Simulate money mule network
        for _ in range(5):
            tx = generate_transaction(entities_df, len(transactions), is_ml=True)
            if tx:
                transactions.append(tx)
    else:
        tx = generate_transaction(entities_df, len(transactions), is_ml)
        if tx:
            transactions.append(tx)
    i += 1

# Create transactions_df with correct column order
transactions_df = pd.DataFrame(transactions, columns=[
    'sender_id', 'receiver_id', 'transaction_id', 'amount', 'transaction_type',
    'timestamp', 'is_cross_border', 'ml_flag'
])

# Validate transactions_df
valid_entity_ids = set(entities_df['entity_id'])
for idx, row in transactions_df.iterrows():
    if len(row) != 8:
        print(f"Warning: Row {idx} in transactions_df has {len(row)} columns: {row}")
    if row['sender_id'] not in valid_entity_ids:
        print(f"Error: Invalid sender_id {row['sender_id']} in transaction {row['transaction_id']}")
    if row['receiver_id'] not in valid_entity_ids:
        print(f"Error: Invalid receiver_id {row['receiver_id']} in transaction {row['transaction_id']}")
    if row['sender_id'].startswith('T') or row['receiver_id'].startswith('T'):
        print(f"Error: Transaction-like ID in transaction {row['transaction_id']}: sender_id={row['sender_id']}, receiver_id={row['receiver_id']}")

# Save to CSV with quoting
entities_df.to_csv('entities.csv', index=False, quoting=csv.QUOTE_ALL)
transactions_df.to_csv('transactions.csv', index=False, quoting=csv.QUOTE_ALL)

print(f"Synthetic data generated: entities.csv ({len(entities_df)} entities) and transactions.csv ({len(transactions_df)} transactions)")

Synthetic data generated: entities.csv (1000 entities) and transactions.csv (5000 transactions)


In [50]:
import kuzu
import os

# Initialize Kuzu database
db = kuzu.Database('kuzu_db')
conn = kuzu.Connection(db)



In [51]:

# Create node and relationship tables
conn.execute("""
    CREATE NODE TABLE Entity(
        entity_id STRING,
        entity_type STRING,
        name STRING,
        country STRING,
        PRIMARY KEY (entity_id)
    )
""")

conn.execute("""
    CREATE REL TABLE Transaction(
        FROM Entity TO Entity,
        transaction_id STRING,
        amount DOUBLE,
        transaction_type STRING,
        timestamp TIMESTAMP,
        is_cross_border BOOLEAN,
        ml_flag INT
    )
""")


<kuzu.query_result.QueryResult at 0x1e300a5b690>

In [52]:
# Load entities from CSV with explicit CSV options

conn.execute("""
    COPY Entity FROM 'entities.csv' (HEADER=TRUE, DELIM=',', QUOTE='"')
""")

# Load transactions from CSV with explicit CSV options
conn.execute("""
    COPY Transaction FROM 'transactions.csv' (HEADER=TRUE, DELIM=',', QUOTE='"')
""")

<kuzu.query_result.QueryResult at 0x1e300a5b700>

In [53]:

# Example Cypher queries for analysis
# Query 1: Find high-degree nodes (potential money mules)
result = conn.execute("""
    MATCH (e:Entity)-[t:Transaction]->()
    RETURN e.entity_id, e.name, e.country, COUNT(t) AS degree
    ORDER BY degree DESC
    LIMIT 10
""")
print("\nTop 10 high-degree nodes:")
while result.has_next():
    row = result.get_next()
    print(row)



Top 10 high-degree nodes:
['I0132', 'Serrano', 'Spain', 16]
['B0155', 'Nexlify', 'Italy', 13]
['I0732', 'Cole', 'India', 13]
['I0684', 'Edwards', 'China', 13]
['I0106', 'Turner', 'Brazil', 13]
['I0065', 'Ward', 'France', 12]
['I0367', 'Quinn', 'Italy', 12]
['I0250', 'Ramirez', 'France', 12]
['I0218', 'Kane', 'United Kingdom', 12]
['I0589', 'Montgomery', 'China', 12]


In [56]:

# Query 3: Detect smurfing (small transactions below threshold)
result = conn.execute("""
    MATCH (e:Entity)-[t:Transaction]->()
    WHERE t.amount < 1000
    RETURN e.entity_id, COUNT(t) AS small_tx_count
    ORDER BY small_tx_count DESC
    LIMIT 10
""")
print("\nPotential smurfing (small transactions):")
while result.has_next():
    row = result.get_next()
    print(row)


Potential smurfing (small transactions):
['I0521', 5]
['I0086', 5]
['I0697', 5]
['I0296', 5]
['B0063', 4]
['B0163', 4]
['I0758', 4]
['I0367', 4]
['I0589', 4]
['I0252', 4]
