Cell 1: Imports

In [None]:
import pandas as pd
import numpy as np
import networkx as nx
import kuzu
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score, recall_score, f1_score, roc_curve, precision_recall_curve, auc
from sklearn.model_selection import KFold, StratifiedKFold
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import os
import shutil
import scipy.sparse as sp

Cell 2: Synthetic Data

In [None]:
def generate_synthetic_data(num_entities: int = 5000, num_transactions: int = 25000):
    """Generate synthetic entities and transactions with injected laundering typologies.

    A benign transaction set is laid out over a Barabasi-Albert graph, then
    1% of the entities are marked suspicious and each of them gets three
    injected patterns (smurfing, money mules, complex layering) whose
    transactions carry ``ml_flag == 1``.  Finally some noise is applied to
    benign and high-risk rows.

    Side effects: seeds NumPy's global RNG with 42, writes ``entities.csv``
    and ``transactions.csv`` to the current working directory, and prints
    summary statistics.

    Args:
        num_entities: Number of entities (graph nodes) to create.
        num_transactions: Number of benign transactions to create before the
            typologies are injected.

    Returns:
        A ``(entities, transactions)`` tuple of DataFrames.
    """
    np.random.seed(42)
    # Step 1: Benign data
    countries = np.random.choice(['US', 'EU', 'ASIA', 'HIGH_RISK'], num_entities, p=[0.4, 0.3, 0.2, 0.1])
    entities = pd.DataFrame({
        'entity_id': [f'E{i:04d}' for i in range(num_entities)],
        'profile_type': np.random.choice(['individual_low', 'individual_high', 'business_small', 'business_large'], num_entities, p=[0.4, 0.1, 0.3, 0.2]),
        'country': countries,
        'agent_id': np.random.randint(1, 100, num_entities),
        'kyc_risk_score': np.random.uniform(0, 0.3, num_entities),
        'dormancy_period': np.random.randint(0, 365, num_entities)
    })
    G = nx.barabasi_albert_graph(num_entities, 5, seed=42)  # Benign network
    edges = [(f'E{u:04d}', f'E{v:04d}') for u, v in G.edges()]
    transactions = pd.DataFrame({
        # Benign transactions cycle through the graph's edge list.
        'sender_id': [edges[i % len(edges)][0] for i in range(num_transactions)],
        'receiver_id': [edges[i % len(edges)][1] for i in range(num_transactions)],
        'amount': np.random.exponential(50000, num_transactions).clip(100, 1500000),
        'timestamp': np.random.randint(0, 30, num_transactions),
        'ml_flag': np.zeros(num_transactions, dtype=int),
        'typology': ['benign'] * num_transactions  # Track typology
    })
    # Add cross-border / high-risk flags based on the parties' countries.
    # One indexed lookup per column instead of scanning `entities` for every
    # transaction row.
    country_by_entity = entities.set_index('entity_id')['country']
    sender_country = transactions['sender_id'].map(country_by_entity)
    receiver_country = transactions['receiver_id'].map(country_by_entity)
    transactions['is_cross_border'] = sender_country != receiver_country
    transactions['high_risk_jurisdiction'] = (sender_country == 'HIGH_RISK') | (receiver_country == 'HIGH_RISK')
    transactions['flagged_receiver'] = np.random.choice([True, False], num_transactions, p=[0.02, 0.98])

    # Step 2: Inject typologies (1% suspicious entities, as per low ML estimates)
    num_suspicious = int(0.01 * num_entities)
    suspicious_ids = np.random.choice(entities['entity_id'], num_suspicious, replace=False)
    # Loop-invariant pool of possible high-risk destinations for mule funds.
    high_risk_ids = entities.loc[entities['country'] == 'HIGH_RISK', 'entity_id'].to_numpy()
    # Injected frames are collected here and concatenated once after the loop;
    # concatenating inside the loop re-copies the whole table every time.
    injected = []
    for sid in suspicious_ids:
        # Common suspicious traits
        is_sid = entities['entity_id'] == sid
        entities.loc[is_sid, 'kyc_risk_score'] = np.random.uniform(0.6, 0.9)
        entities.loc[is_sid, 'dormancy_period'] = np.random.randint(200, 365)
        ts = np.random.randint(0, 3)

        # Typology 1: Smurfing (split large amount into small tx to many receivers)
        large_amount = np.random.uniform(200000, 800000)
        num_smurfs = np.random.randint(10, 20)
        small_amounts = np.full(num_smurfs, large_amount / num_smurfs)
        receivers = np.random.choice(entities['entity_id'], num_smurfs, replace=False)
        injected.append(pd.DataFrame({
            'sender_id': [sid] * num_smurfs,
            'receiver_id': receivers,
            'amount': small_amounts,
            'timestamp': [ts] * num_smurfs,
            'ml_flag': 1,
            'is_cross_border': np.random.choice([True, False], num_smurfs, p=[0.7, 0.3]),
            'high_risk_jurisdiction': np.random.choice([True, False], num_smurfs, p=[0.5, 0.5]),
            'flagged_receiver': np.random.choice([True, False], num_smurfs, p=[0.3, 0.7]),
            'typology': ['smurfing'] * num_smurfs
        }))

        # Typology 2: Money Mules (recruit mules to forward funds)
        num_mules = np.random.randint(3, 6)
        mules = np.random.choice(entities['entity_id'], num_mules, replace=False)
        mule_amount = np.random.uniform(5000, 50000)
        for mule in mules:
            # Suspicious -> Mule -> Final (back or to high-risk).
            # If no HIGH_RISK entity exists, fall back to the round trip
            # instead of sampling from an empty pool (which raises).
            if np.random.rand() > 0.5 and len(high_risk_ids) > 0:
                final = np.random.choice(high_risk_ids, 1)[0]
            else:
                final = sid  # Round-trip
            injected.append(pd.DataFrame({
                'sender_id': [sid, mule],
                'receiver_id': [mule, final],
                'amount': [mule_amount] * 2,
                'timestamp': [ts, ts + 1],
                'ml_flag': 1,
                'is_cross_border': [False, True],
                'high_risk_jurisdiction': [False, True],
                'flagged_receiver': [True, False],
                'typology': ['money_mule'] * 2
            }))

        # Typology 3: CLS (complex layering: multi-hop chains/loops)
        layers = np.random.randint(4, 7)
        chain = [sid] + list(np.random.choice(entities['entity_id'], layers - 1, replace=False)) + [sid if np.random.rand() > 0.5 else np.random.choice(entities['entity_id'])]
        layer_amount = np.random.uniform(10000, 100000)
        for j in range(len(chain) - 1):
            injected.append(pd.DataFrame({
                'sender_id': [chain[j]],
                'receiver_id': [chain[j+1]],
                'amount': [layer_amount],
                'timestamp': [ts + j],  # one hop per time step
                'ml_flag': 1,
                'is_cross_border': np.random.choice([True, False], 1, p=[0.8, 0.2]),
                'high_risk_jurisdiction': np.random.choice([True, False], 1, p=[0.6, 0.4]),
                'flagged_receiver': np.random.choice([True, False], 1, p=[0.4, 0.6]),
                'typology': ['cls']
            }))

    transactions = pd.concat([transactions, *injected], ignore_index=True)

    # Step 3: Refine realism (add noise to 10% benign, adjust for Phase 2's real-world focus)
    benign_idx = transactions.index[transactions['typology'] == 'benign']
    # Sample without replacement so exactly 10% distinct rows are perturbed,
    # and draw one factor per row; a single shared scalar would just rescale
    # all selected rows identically.
    noise_idx = np.random.choice(benign_idx, int(0.1 * num_transactions), replace=False)
    transactions.loc[noise_idx, 'amount'] *= np.random.uniform(0.8, 1.2, len(noise_idx))
    transactions.loc[noise_idx, 'is_cross_border'] = np.random.choice([True, False], len(noise_idx), p=[0.4, 0.6])
    # Additional noise for high-risk variability (inspired by 2025 updates on cross-border integrity)
    high_risk_noise = transactions[transactions['high_risk_jurisdiction']]
    if not high_risk_noise.empty:
        high_risk_noise_idx = np.random.choice(high_risk_noise.index, int(0.2 * len(high_risk_noise)), replace=False)
        transactions.loc[high_risk_noise_idx, 'flagged_receiver'] = True

    entities.to_csv('entities.csv', index=False)
    transactions.to_csv('transactions.csv', index=False)
    print(f"Generated {num_entities} entities and {len(transactions)} transactions (after typologies)")
    print(transactions['typology'].value_counts())
    print(transactions['amount'].describe())
    print(transactions[['is_cross_border', 'ml_flag', 'flagged_receiver', 'high_risk_jurisdiction']].value_counts())
    print(entities[['kyc_risk_score', 'dormancy_period']].describe())
    print(f"Proportion of ml_flag = 1: {transactions['ml_flag'].mean():.4f}")
    return entities, transactions

# Call the function to generate data with its default sizes.
# Besides returning the two DataFrames, the call also writes entities.csv and
# transactions.csv and prints summary statistics.
entities, transactions = generate_synthetic_data()

Cell 3: Graph Convolutional Network

In [None]:
# Cell 3a
#
# Builds the transaction graph and derives one feature frame per feature
# family, each keyed by 'entity_id'.
#
# Topology features (degree, in_degree, clustering, round trips) come from
# the graph G.  Per-transaction features (counts, amounts, timestamps,
# cross-border totals) come from the `transactions` rows themselves: a
# DiGraph keeps a single edge per (sender, receiver) pair, so repeated
# transfers between the same two entities overwrite each other and would
# otherwise be missing from those statistics.


# Load data (if not already in memory from Cell 2)
entities = pd.read_csv('entities.csv')
transactions = pd.read_csv('transactions.csv')

# Build directed graph (since transactions have direction)
G = nx.DiGraph()
# Add nodes with attributes (every entity column becomes a node attribute)
G.add_nodes_from((rec['entity_id'], rec) for rec in entities.to_dict('records'))
# Add edges with attributes; for a repeated (sender, receiver) pair the
# attributes of the last transaction win.
G.add_edges_from(
    (rec['sender_id'], rec['receiver_id'], rec)
    for rec in transactions.to_dict('records')
)

# Verify graph
num_nodes = G.number_of_nodes()
num_edges = G.number_of_edges()
print(f"Graph built with {num_nodes} nodes and {num_edges} edges")
if num_nodes == 0 or num_edges == 0:
    raise ValueError("Graph is empty.")

# Per-entity transaction accumulators, filled in one pass over the rows so
# that parallel transfers are all counted.
sent_amounts = {}       # sender -> list of amounts sent
sent_timestamps = {}    # sender -> list of timestamps of sent transactions
cross_border_sent = {}  # sender -> total amount sent cross-border
received_count = {}     # receiver -> number of transactions received
for sender, receiver, amount, ts, cross in zip(
    transactions['sender_id'],
    transactions['receiver_id'],
    transactions['amount'],
    transactions['timestamp'],
    transactions['is_cross_border'],
):
    sent_amounts.setdefault(sender, []).append(amount)
    sent_timestamps.setdefault(sender, []).append(ts)
    if cross:
        cross_border_sent[sender] = cross_border_sent.get(sender, 0) + amount
    received_count[receiver] = received_count.get(receiver, 0) + 1

# Feature extraction (replacing Kuzu queries)

# Degree (out-degree, since directed): number of distinct receivers
degree_dict = dict(G.out_degree())
degree_df = pd.DataFrame({'entity_id': list(degree_dict.keys()), 'degree': list(degree_dict.values())}).fillna(0)

# In-degree (for later features): number of distinct senders
in_degree_dict = dict(G.in_degree())
in_degree_df = pd.DataFrame({'entity_id': list(in_degree_dict.keys()), 'in_degree': list(in_degree_dict.values())}).fillna(0)

# Small transaction count (<1000)
small_tx = {
    node: sum(1 for a in sent_amounts.get(node, []) if a < 1000)
    for node in G.nodes()
}
small_tx_df = pd.DataFrame({'entity_id': list(small_tx.keys()), 'small_tx_count': list(small_tx.values())}).fillna(0)

# Clustering coefficient (using undirected version for simplicity, as in your query)
undirected_G = G.to_undirected()
clustering_dict = nx.clustering(undirected_G)
cluster_df = pd.DataFrame({'entity_id': list(clustering_dict.keys()), 'clustering_coeff': list(clustering_dict.values())}).fillna(0)

# Transaction frequency variance (based on the number of sent transactions).
# Despite the column name, the value is the absolute deviation of each
# entity's count from the mean count (sqrt of the squared deviation).
tx_count = {node: len(sent_amounts.get(node, [])) for node in G.nodes()}
tx_count_df = pd.DataFrame({'entity_id': list(tx_count.keys()), 'tx_count': list(tx_count.values())}).fillna(0)
mean_tx_count = tx_count_df['tx_count'].mean()
tx_count_df['tx_freq_variance'] = (tx_count_df['tx_count'] - mean_tx_count).abs().fillna(0)
tx_freq_df = tx_count_df[['entity_id', 'tx_freq_variance']]

# Amount variance, skewness, avg, high_value_ratio
amount_data = {}
for node in G.nodes():
    amounts = sent_amounts.get(node, [])
    if len(amounts) > 0:
        avg = np.mean(amounts)
        # Sample variance / skewness need at least two observations.
        var = np.var(amounts, ddof=1) if len(amounts) > 1 else 0
        skew = (((np.array(amounts) - avg) ** 3).mean()) / (np.std(amounts, ddof=1) ** 3 + 1e-7) if len(amounts) > 1 else 0
        high_ratio = sum(a > 100000 for a in amounts) / len(amounts)
    else:
        avg = var = skew = high_ratio = 0
    amount_data[node] = {'avg_amount': avg, 'amount_variance': var, 'amount_skewness': skew, 'high_value_ratio': high_ratio}
var_skew_df = pd.DataFrame.from_dict(amount_data, orient='index').reset_index().rename(columns={'index': 'entity_id'}).fillna(0)

# Temporal features: velocity, burstiness, concentration
time_data = {}
for node in G.nodes():
    timestamps = sent_timestamps.get(node, [])
    if len(timestamps) > 0:
        min_t, max_t = min(timestamps), max(timestamps)
        # Transactions per time step over the inclusive span.  Dividing by
        # (span + 1e-7) made every single-timestamp sender score ~1e7 * count.
        # NOTE(review): assumes timestamps are integer period indices
        # (Cell 2 generates them with randint) - confirm for real data.
        velocity = len(timestamps) / (max_t - min_t + 1)
        burst = np.var(timestamps, ddof=1) * 10 if len(timestamps) > 1 else 0
        conc = 1 if (max_t - min_t < 3) else 0  # Binary as per your mean logic
    else:
        velocity = burst = conc = 0
    time_data[node] = {'tx_velocity': velocity, 'burstiness': burst, 'temporal_concentration': conc}
time_agg_df = pd.DataFrame.from_dict(time_data, orient='index').reset_index().rename(columns={'index': 'entity_id'}).fillna(0)

# Additional features (kyc, dormancy, cross_border_amount, tx_frequency, directionality_ratio)
features = {}
for node in G.nodes():
    node_data = G.nodes[node]
    tx_frequency = len(sent_amounts.get(node, []))
    received = received_count.get(node, 0)
    # Share of the entity's transactions that are incoming; the epsilon
    # keeps entities with no transactions at 0 instead of dividing by zero.
    directionality_ratio = received / (received + tx_frequency + 1e-7)
    features[node] = {
        'kyc_risk_score': node_data.get('kyc_risk_score', 0),
        'dormancy_period': node_data.get('dormancy_period', 0),
        'cross_border_amount': cross_border_sent.get(node, 0),
        'in_degree': G.in_degree(node),
        'tx_frequency': tx_frequency,
        'directionality_ratio': directionality_ratio
    }
features_df = pd.DataFrame.from_dict(features, orient='index').reset_index().rename(columns={'index': 'entity_id'}).fillna(0)

# Round-tripping (count 2-hop cycles back to self): successors that also
# have an edge pointing back at the node, self-loops excluded.
round_trip = {
    node: sum(
        1 for neighbor in G.successors(node)
        if neighbor != node and G.has_edge(neighbor, node)
    )
    for node in G.nodes()
}
round_trip_df = pd.DataFrame({'entity_id': list(round_trip.keys()), 'round_trip_count': list(round_trip.values())}).fillna(0)

# Ensure the base frame covers all entities (add missing with defaults).
# Only degree_df is aligned here; the other frames are keyed by every node
# of G, which includes every row of `entities`.
all_entities = pd.DataFrame({'entity_id': entities['entity_id']})
degree_df = all_entities.merge(degree_df, on='entity_id', how='left').fillna(0)

In [None]:
# Cell 3b
#
# Joins the per-entity feature frames into data_df, derives the anomaly
# label from the share of ml_flag transactions each entity sends, runs a few
# sanity checks and writes merged_features.csv.

# Merge features from Cell 3
data_df = degree_df.merge(small_tx_df, on='entity_id', how='left')\
                   .merge(var_skew_df, on='entity_id', how='left')\
                   .merge(cluster_df, on='entity_id', how='left')\
                   .merge(tx_freq_df, on='entity_id', how='left')\
                   .merge(time_agg_df, on='entity_id', how='left')\
                   .merge(features_df, on='entity_id', how='left')\
                   .merge(round_trip_df, on='entity_id', how='left')
# features_df already carries an 'in_degree' column.  Merging in_degree_df on
# top of it makes pandas rename the pair to in_degree_x / in_degree_y, so no
# 'in_degree' column would exist afterwards.  Only merge it when missing.
if 'in_degree' not in data_df.columns:
    data_df = data_df.merge(in_degree_df, on='entity_id', how='left')

# Fill missing values (e.g., entities with no transactions)
data_df.fillna({
    'degree': 0, 'small_tx_count': 0, 'avg_amount': 0, 'amount_variance': 0, 
    'amount_skewness': 0, 'high_value_ratio': 0, 'clustering_coeff': 0, 
    'tx_freq_variance': 0, 'tx_velocity': 0, 'burstiness': 0, 
    'temporal_concentration': 0, 'kyc_risk_score': 0, 'dormancy_period': 0, 
    'cross_border_amount': 0, 'in_degree': 0, 'tx_frequency': 0, 
    'directionality_ratio': 0, 'round_trip_count': 0
}, inplace=True)

# Compute ml_flag_score for anomaly labeling: mean ml_flag over the node's
# outgoing edges in G (0 for nodes that send nothing).
ml_flag_dict = {}
for node in G.nodes():
    ml_flags = [data.get('ml_flag', 0) for _, _, data in G.out_edges(node, data=True)]
    ml_flag_dict[node] = np.mean(ml_flags) if ml_flags else 0
ml_flag_df = pd.DataFrame({'entity_id': list(ml_flag_dict.keys()), 'ml_flag_score': list(ml_flag_dict.values())}).fillna(0)

data_df = data_df.merge(ml_flag_df, on='entity_id', how='left').fillna({'ml_flag_score': 0})

# Define anomaly labels using ROC-optimized threshold
data_df['is_anomaly'] = (data_df['ml_flag_score'] > 0).astype(int)
if data_df['is_anomaly'].sum() > 1:  # Ensure enough positives
    fpr, tpr, thresholds = roc_curve(data_df['is_anomaly'], data_df['ml_flag_score'])
    anomaly_threshold = thresholds[np.argmax(tpr - fpr)]  # Youden’s J
    # roc_curve thresholds are inclusive (positive when score >= threshold).
    # A strict '>' would drop the entities sitting exactly on the chosen
    # threshold from the positive class.
    data_df['is_anomaly'] = (data_df['ml_flag_score'] >= anomaly_threshold).astype(int)
else:
    print("Warning: Too few positive samples; using 95th percentile threshold")
    anomaly_threshold = np.percentile(data_df['ml_flag_score'], 95)
    data_df['is_anomaly'] = (data_df['ml_flag_score'] > anomaly_threshold).astype(int)

# Validate features
print("Merged data_df shape:", data_df.shape)
print("NaN counts:\n", data_df.isna().sum())
print("Data types:\n", data_df.dtypes)
print("Feature stats:\n", data_df.describe())
numeric_cols = data_df.select_dtypes(include=[np.number]).columns
if not np.isfinite(data_df[numeric_cols].to_numpy(dtype=float)).all():
    print("Warning: Non-finite values detected; replacing with 0")
    # fillna alone leaves +/-inf untouched, so map them to NaN first.
    data_df[numeric_cols] = data_df[numeric_cols].replace([np.inf, -np.inf], np.nan).fillna(0)

# Check for leakage (high correlation with label)
correlations = data_df.drop(columns=['entity_id', 'is_anomaly', 'ml_flag_score']).corrwith(data_df['ml_flag_score'])
print("Feature correlations with ml_flag_score:\n", correlations)

# Save for debugging
data_df.to_csv('merged_features.csv', index=False)

Cell 4: Simulation across institutions

In [None]:
# Simulate institutions (e.g., 3 banks)
#
# One round of federated averaging: every institution trains a copy of the
# global model on its own normal (non-anomalous) training rows, and the
# global parameters are then set to the sample-size-weighted mean of the
# local parameters.
# Relies on Autoencoder, input_dim, X, train_idx and criterion being defined
# by other cells.
num_insts = 3
# Random assignment of every row of data_df to one institution.
inst_labels = np.random.randint(0, num_insts, len(data_df))
global_model = Autoencoder(input_dim)  # Or your GCN

local_params = []  # one list of parameter tensors per participating institution
local_sizes = []   # number of training rows behind each entry of local_params

for inst in range(num_insts):
    inst_idx = np.where(inst_labels == inst)[0]
    inst_train_idx = np.intersect1d(train_idx, inst_idx)
    if len(inst_train_idx) == 0:
        continue
    local_X = X[inst_train_idx]
    normal_mask = (data_df.iloc[inst_train_idx]['is_anomaly'] == 0).to_numpy()
    local_normal = local_X[normal_mask]
    if len(local_normal) == 0:
        # Nothing to reconstruct; training on an empty batch yields a NaN
        # loss that would poison the average.
        continue
    local_tensor = torch.FloatTensor(local_normal)

    # Start from the current global weights so that the local models share
    # an initialisation and their parameters are meaningful to average.
    local_model = Autoencoder(input_dim)  # Train local
    local_model.load_state_dict(global_model.state_dict())
    local_opt = optim.Adam(local_model.parameters(), lr=0.0005)
    for epoch in range(100):  # Shorter local epochs
        outputs = local_model(local_tensor)
        loss = criterion(outputs, local_tensor)
        local_opt.zero_grad()
        loss.backward()
        local_opt.step()

    local_params.append([p.detach().clone() for p in local_model.parameters()])
    local_sizes.append(len(local_normal))

# Average to global (simple FL).  Aggregating once, after all institutions
# have trained, weights each one by its sample count; the sequential
# (global + local) / 2 update kept part of the random initial weights and
# favoured whichever institution came last.
# NOTE(review): only parameters() are averaged; if Autoencoder has buffers
# (e.g. BatchNorm running stats) they keep their initial global values.
if local_params:
    total_rows = float(sum(local_sizes))
    with torch.no_grad():
        for i, global_param in enumerate(global_model.parameters()):
            averaged = sum(
                params[i] * (size / total_rows)
                for params, size in zip(local_params, local_sizes)
            )
            global_param.copy_(averaged)

# Use global_model for inference as before