First, we will define the schema of transaction data. Transactions must contain entities. Who is sending/receiving money? Where is the money coming from and where is it going?

In [7]:
from faker import Faker
import pandas as pd
import numpy as np

# Initialize Faker
fake = Faker()

# Generate synthetic entities (individuals and businesses)
n_entities = 1000
entities = []
for i in range(n_entities):
    entity_type = np.random.choice(['Individual', 'Business'], p=[0.7, 0.3])
    if entity_type == 'Individual':
        name = fake.name()
        country = fake.country()
        account = fake.bban()
    else:
        name = fake.company()
        country = fake.country()
        account = fake.iban()
    entities.append({
        'Entity_ID': f'E{i:04d}',
        'Name': name,
        'Type': entity_type,
        'Country': country,
        'Account': account
    })

# Create DataFrame for entities
entities_df = pd.DataFrame(entities)

Next, we will simulate transaction patterns. These are normal transactions that occur over a set period of time. Amounts should vary to be relistic

In [8]:
from datetime import datetime, timedelta

# Parameters
n_transactions = 10000
start_date = datetime(2024, 1, 1)
end_date = datetime(2024, 12, 31)

# Generate legitimate transactions
transactions = []
for i in range(n_transactions):
    sender = np.random.choice(entities_df['Entity_ID'])
    receiver = np.random.choice(entities_df['Entity_ID'])
    while sender == receiver:  # Ensure sender != receiver
        receiver = np.random.choice(entities_df['Entity_ID'])
    
    # Transaction amount (log-normal distribution for realistic spread)
    amount = np.random.lognormal(mean=6, sigma=1.5)  # Mean ~$400, varies widely
    amount = round(max(10, min(amount, 100000)), 2)  # Cap between $10 and $100K
    
    # Random timestamp
    days = (end_date - start_date).days
    timestamp = start_date + timedelta(days=np.random.randint(0, days), 
                                      seconds=np.random.randint(0, 86400))
    
    # Transaction type and description
    trans_type = np.random.choice(['Wire Transfer', 'Cash Deposit', 'Payment'])
    description = fake.sentence(nb_words=5) if trans_type == 'Payment' else f"{trans_type} #{i}"
    
    transactions.append({
        'Transaction_ID': f'T{i:06d}',
        'Sender_ID': sender,
        'Receiver_ID': receiver,
        'Amount': amount,
        'Timestamp': timestamp,
        'Transaction_Type': trans_type,
        'Sender_Country': entities_df[entities_df['Entity_ID'] == sender]['Country'].iloc[0],
        'Receiver_Country': entities_df[entities_df['Entity_ID'] == receiver]['Country'].iloc[0],
        'Description': description,
        'Suspicious_Flag': False
    })

# Create DataFrame for transactions
transactions_df = pd.DataFrame(transactions)

Money laundering tactics mentioned in the Project Aurora's file include Money Mules and Smurfing.
Money mules are funds that are transfered across multiple accounts to hide the source of the funds.
Smurfing breaks large amounts of money into multiple smaller transactions to avoid being detected

In [9]:
# Select a subset of entities as money mules
money_mules = entities_df.sample(n=50)['Entity_ID'].tolist()

# Generate money mule transactions
n_mule_transactions = 500
mule_transactions = []
for i in range(n_mule_transactions):
    # Chain of 3-5 accounts for layering
    chain_length = np.random.randint(3, 6)
    chain = np.random.choice(money_mules, size=chain_length, replace=False)
    
    # Total illicit amount
    total_amount = np.random.uniform(5000, 50000)
    
    # Split into smaller transactions
    for j in range(chain_length - 1):
        amount = round(total_amount / (chain_length - 1) * np.random.uniform(0.8, 1.2), 2)
        timestamp = start_date + timedelta(days=np.random.randint(0, days), 
                                         seconds=np.random.randint(0, 3600))  # Rapid transfers
        mule_transactions.append({
            'Transaction_ID': f'MT{i:04d}_{j}',
            'Sender_ID': chain[j],
            'Receiver_ID': chain[j + 1],
            'Amount': min(amount, 9999.99),  # Keep under reporting threshold
            'Timestamp': timestamp,
            'Transaction_Type': 'Wire Transfer',
            'Sender_Country': entities_df[entities_df['Entity_ID'] == chain[j]]['Country'].iloc[0],
            'Receiver_Country': entities_df[entities_df['Entity_ID'] == chain[j + 1]]['Country'].iloc[0],
            'Description': 'Transfer',
            'Suspicious_Flag': True
        })

# Append to transactions DataFrame
mule_transactions_df = pd.DataFrame(mule_transactions)
transactions_df = pd.concat([transactions_df, mule_transactions_df], ignore_index=True)

In [10]:
# Generate smurfing transactions
n_smurf_transactions = 200
smurf_transactions = []
for i in range(n_smurf_transactions):
    total_amount = np.random.uniform(10000, 50000)
    n_splits = np.random.randint(3, 10)  # Split into 3-10 smaller transactions
    split_amount = total_amount / n_splits
    
    sender = np.random.choice(money_mules)
    for j in range(n_splits):
        receiver = np.random.choice(entities_df['Entity_ID'])
        while receiver == sender:
            receiver = np.random.choice(entities_df['Entity_ID'])
        
        timestamp = start_date + timedelta(days=np.random.randint(0, days), 
                                         seconds=np.random.randint(0, 3600))
        smurf_transactions.append({
            'Transaction_ID': f'ST{i:04d}_{j}',
            'Sender_ID': sender,
            'Receiver_ID': receiver,
            'Amount': min(round(split_amount * np.random.uniform(0.8, 1.2), 2), 9999.99),
            'Timestamp': timestamp,
            'Transaction_Type': 'Cash Deposit',
            'Sender_Country': entities_df[entities_df['Entity_ID'] == sender]['Country'].iloc[0],
            'Receiver_Country': entities_df[entities_df['Entity_ID'] == receiver]['Country'].iloc[0],
            'Description': 'Deposit',
            'Suspicious_Flag': True
        })

# Append to transactions DataFrame
smurf_transactions_df = pd.DataFrame(smurf_transactions)
transactions_df = pd.concat([transactions_df, smurf_transactions_df], ignore_index=True)

In [14]:
# Save to CSV in the a_graph folder
transactions_df.to_csv('synthetic_transactions.csv', index=False)

# Basic summary
print("Total Transactions:", len(transactions_df))
print("Suspicious Transactions:", len(transactions_df[transactions_df['Suspicious_Flag']]))
print("Unique Entities:", len(set(transactions_df['Sender_ID']) | set(transactions_df['Receiver_ID'])))
print("Country Distribution:\n", transactions_df['Sender_Country'].value_counts())

Total Transactions: 12686
Suspicious Transactions: 2686
Unique Entities: 1000
Country Distribution:
 Sender_Country
Comoros                    204
Nicaragua                  174
Belize                     169
Denmark                    163
Cocos (Keeling) Islands    155
                          ... 
Turkey                      11
Sierra Leone                10
Australia                   10
Rwanda                      10
Tuvalu                       5
Name: count, Length: 236, dtype: int64
