In [3]:
from faker import Faker
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

# Initialize Faker and a NumPy generator from one fixed seed.
# Without seeding, every re-run of this cell writes a different
# data_entities.csv, so the file could not be reproduced from the notebook.
# The seed is applied to this Faker instance and to a local NumPy Generator
# only, so the global np.random state used elsewhere is left untouched.
ENTITY_SEED = 42
fake = Faker()
fake.seed_instance(ENTITY_SEED)
entity_rng = np.random.default_rng(ENTITY_SEED)

# NOTE(review): another cell in this notebook also writes data_entities.csv and
# derives the transaction country columns from its own entities_df. Running
# this cell afterwards replaces the entity file without regenerating the
# transactions — confirm which cell is meant to be the source of truth.

# Generate entities
n_entities = 1000

# Draw every entity type up front (70% Individual / 30% Business); a single
# type per entity drives the name generator and the account format below.
entity_types = entity_rng.choice(
    ['Individual', 'Business'], size=n_entities, p=[0.7, 0.3]
).tolist()

entities = []
for i, entity_type in enumerate(entity_types):
    is_individual = entity_type == 'Individual'
    entities.append({
        'Entity_ID': f'E{i:04d}',  # Unique IDs: E0000 to E0999
        'Name': fake.name() if is_individual else fake.company(),
        'Type': entity_type,
        'Country': fake.country(),
        # Individuals get a BBAN, businesses an IBAN.
        'Account': fake.bban() if is_individual else fake.iban()
    })
entities_df = pd.DataFrame(entities)

# Verify uniqueness: Entity_ID is the join key used by the transactions data.
if entities_df['Entity_ID'].duplicated().any():
    raise ValueError("Duplicate Entity_IDs in entities_df")

# Save entities
entities_df.to_csv('data_entities.csv', index=False, encoding='utf-8')
print(f"Generated data_entities.csv with {len(entities_df)} entities")


Generated data_entities.csv with 1000 entities
Generated data_synthetic_transactions.csv with 12714 transactions


In [1]:


from faker import Faker
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

fake = Faker()
n_entities = 1000

# Build the entity table.
# The entity type is drawn ONCE per entity and then drives the name generator,
# the Type label and the account format. Drawing a separate coin flip for each
# field (as this cell previously did) produced inconsistent rows, e.g. a
# company name labelled 'Individual' holding an IBAN.
# The split stays 50/50, matching the coin flip this cell used for each field.
entities = []
for i in range(n_entities):
    is_individual = bool(np.random.choice([0, 1]))
    entities.append({
        'Entity_ID': f'E{i:04d}',  # E0000 .. E0999
        'Name': fake.name() if is_individual else fake.company(),
        'Type': 'Individual' if is_individual else 'Business',
        'Country': fake.country(),
        'Account': fake.bban() if is_individual else fake.iban(),
    })
entities_df = pd.DataFrame(entities)

# Persist the entity table and report how many rows were written.
entities_df.to_csv('data_entities.csv', index=False, encoding='utf-8')
print(f"Generated data_entities.csv with {len(entities_df)} entities")

# --- Baseline (non-suspicious) transactions ---------------------------------
n_transactions = 10000
start_date = datetime(2024, 1, 1)
end_date = datetime(2024, 12, 31)
# Whole days between the two bounds; used as the exclusive upper limit of the
# random day offset, so generated timestamps always fall before end_date.
days = (end_date - start_date).days
entity_ids = entities_df['Entity_ID'].tolist()

# Entity_ID -> Country lookup, built once. Filtering entities_df with a boolean
# mask for every sender and receiver scanned the whole frame twice per row.
country_by_id = dict(zip(entities_df['Entity_ID'], entities_df['Country']))

transactions = []
for i in range(n_transactions):
    sender = np.random.choice(entity_ids)
    receiver = np.random.choice(entity_ids)
    # Re-draw until the receiver differs from the sender (no self-transfers).
    while sender == receiver:
        receiver = np.random.choice(entity_ids)
    # Log-normal amount, clamped to the range [10, 100000] and rounded to cents.
    amount = round(max(10, min(np.random.lognormal(mean=6, sigma=1.5), 100000)), 2)
    # Random day offset plus a random second within that day.
    timestamp = start_date + timedelta(days=np.random.randint(0, days), seconds=np.random.randint(0, 86400))
    trans_type = np.random.choice(['Wire Transfer', 'Cash Deposit', 'Payment'])
    # Payments get a random sentence; other types get "<type> #<index>".
    description = fake.sentence(nb_words=5) if trans_type == 'Payment' else f"{trans_type} #{i}"
    transactions.append({
        'Transaction_ID': f'T{i:06d}',
        'Sender_ID': sender,
        'Receiver_ID': receiver,
        'Amount': amount,
        'Timestamp': timestamp.isoformat(),
        'Transaction_Type': trans_type,
        'Sender_Country': country_by_id[sender],
        'Receiver_Country': country_by_id[receiver],
        'Description': description,
        'Suspicious_Flag': False
    })

# --- Money-mule chains (flagged suspicious) ---------------------------------
# A fixed sample of 50 entities acts as the mule pool (random_state pins which
# rows are picked for a given entities_df).
money_mules = entities_df.sample(n=50, random_state=42)['Entity_ID'].tolist()

# Entity_ID -> Country lookup, built once for this section instead of
# boolean-filtering entities_df twice for every generated hop.
mule_country_by_id = dict(zip(entities_df['Entity_ID'], entities_df['Country']))

# Despite the name, this is the number of CHAINS: each chain contributes
# chain_length - 1 transactions (2 to 4 hops).
n_mule_transactions = 500
for i in range(n_mule_transactions):
    chain_length = np.random.randint(3, 6)  # 3, 4 or 5 distinct mules
    chain = np.random.choice(money_mules, size=chain_length, replace=False)
    total_amount = np.random.uniform(5000, 50000)
    for j in range(chain_length - 1):
        # Even share of the total with +/-20% jitter, capped at 9999.99
        # (presumably to stay under a 10,000 reporting threshold — confirm).
        amount = min(round(total_amount / (chain_length - 1) * np.random.uniform(0.8, 1.2), 2), 9999.99)
        # NOTE(review): every hop draws its own random day, so hops of one
        # chain are not time-ordered; the time of day always falls in the first
        # hour after midnight. Confirm this is the intended pattern.
        timestamp = start_date + timedelta(days=np.random.randint(0, days), seconds=np.random.randint(0, 3600))
        hop_sender = chain[j]
        hop_receiver = chain[j + 1]
        transactions.append({
            'Transaction_ID': f'MT{i:04d}_{j}',
            'Sender_ID': hop_sender,
            'Receiver_ID': hop_receiver,
            'Amount': amount,
            'Timestamp': timestamp.isoformat(),
            'Transaction_Type': 'Wire Transfer',
            'Sender_Country': mule_country_by_id[hop_sender],
            'Receiver_Country': mule_country_by_id[hop_receiver],
            'Description': 'Transfer',
            'Suspicious_Flag': True
        })

# --- Smurfing / structuring (flagged suspicious) ----------------------------
# Entity_ID -> Country lookup, built once for this section instead of
# boolean-filtering entities_df twice for every generated deposit.
smurf_country_by_id = dict(zip(entities_df['Entity_ID'], entities_df['Country']))

# Number of smurfing EVENTS; each event is split into 3 to 9 deposits.
n_smurf_transactions = 200
for i in range(n_smurf_transactions):
    total_amount = np.random.uniform(10000, 50000)
    n_splits = np.random.randint(3, 10)
    split_amount = total_amount / n_splits
    # One sender from the mule pool fans the total out to several receivers.
    sender = np.random.choice(money_mules)
    for j in range(n_splits):
        receiver = np.random.choice(entity_ids)
        # Re-draw until the receiver differs from the sender.
        while receiver == sender:
            receiver = np.random.choice(entity_ids)
        # Even split with +/-20% jitter, capped at 9999.99
        # (presumably to stay under a 10,000 reporting threshold — confirm).
        amount = min(round(split_amount * np.random.uniform(0.8, 1.2), 2), 9999.99)
        # Each deposit draws its own random day; the time of day always falls
        # in the first hour after midnight.
        timestamp = start_date + timedelta(days=np.random.randint(0, days), seconds=np.random.randint(0, 3600))
        transactions.append({
            'Transaction_ID': f'ST{i:04d}_{j}',
            'Sender_ID': sender,
            'Receiver_ID': receiver,
            'Amount': amount,
            'Timestamp': timestamp.isoformat(),
            'Transaction_Type': 'Cash Deposit',
            'Sender_Country': smurf_country_by_id[sender],
            'Receiver_Country': smurf_country_by_id[receiver],
            'Description': 'Deposit',
            'Suspicious_Flag': True
        })

# Collect every generated record into a single frame.
transactions_df = pd.DataFrame(transactions)

# Check 1: transaction identifiers must be unique.
has_duplicate_ids = transactions_df['Transaction_ID'].duplicated().any()
if has_duplicate_ids:
    raise ValueError("Duplicate Transaction_IDs")

# Check 2: every sender and receiver must exist in the entity table.
referenced_ids = set(transactions_df['Sender_ID']).union(transactions_df['Receiver_ID'])
known_ids = set(entities_df['Entity_ID'])
invalid_ids = referenced_ids - known_ids
if invalid_ids:
    raise ValueError(f"Invalid IDs: {invalid_ids}")

# Both checks passed: write the CSV and report the row count.
transactions_df.to_csv(
    'data_synthetic_transactions.csv',
    index=False,
    encoding='utf-8',
)
print(f"Generated data_synthetic_transactions.csv with {len(transactions_df)} transactions")


Generated data_entities.csv with 1000 entities
Generated data_synthetic_transactions.csv with 12656 transactions


In [2]:


# Rebuild the transactions frame from the in-memory `transactions` list.
# This cell relies on `transactions` and `entities_df` already existing in the
# kernel, i.e. the generation cell must have been run first.
transactions_df = pd.DataFrame(transactions)

# Integrity checks before anything is written to disk:
#   1. no Transaction_ID appears more than once;
#   2. every Sender_ID / Receiver_ID is a known Entity_ID.
if transactions_df.duplicated(subset='Transaction_ID').any():
    raise ValueError("Duplicate Transaction_IDs")

sender_ids = set(transactions_df['Sender_ID'])
invalid_ids = sender_ids.union(transactions_df['Receiver_ID']) - set(entities_df['Entity_ID'])
if len(invalid_ids) > 0:
    raise ValueError(f"Invalid IDs: {invalid_ids}")

# Write the validated transactions to CSV and report the row count.
csv_options = {'index': False, 'encoding': 'utf-8'}
transactions_df.to_csv('data_synthetic_transactions.csv', **csv_options)
print(f"Generated data_synthetic_transactions.csv with {len(transactions_df)} transactions")

Generated data_synthetic_transactions.csv with 12656 transactions
