<a href="https://colab.research.google.com/github/alamindhaly/ML_Research_Resorces/blob/main/Anti_Money_Laundering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip -q install pandas numpy scikit-learn torch torch-geometric

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.7/63.7 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m19.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import pandas as pd
import numpy as np
import hashlib
import json

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, random_split

from torch_geometric.data import Data as GeoData
from torch_geometric.nn import GATConv

from google.colab import drive

In [None]:
# ---- Run configuration: every value a reader might want to tune ----
RANDOM_STATE = 42     # one seed shared by NumPy, PyTorch, window subsampling and the data split
SEQ_LEN = 10          # number of transactions in each per-sender input sequence
BATCH_SIZE = 256      # samples per optimisation step
EPOCHS = 5            # deliberately small for quick runs; raise for real training
LR = 1e-3             # learning rate handed to the Adam optimizer
TEST_SIZE = 0.2       # fraction of sequence samples held out for testing

# Seed both global RNGs before any data or model is created.
np.random.seed(RANDOM_STATE)
torch.manual_seed(RANDOM_STATE)

# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
_device_name = "cuda" if torch.cuda.is_available() else "cpu"
DEVICE = torch.device(_device_name)
print("Using device:", DEVICE)

Using device: cpu


In [None]:
# Mount Google Drive at /content/drive; the dataset path and the model save path
# used by later cells both live under this mount point.
# NOTE: `drive` is also imported in the imports cell, so this import is redundant
# but harmless.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Location of the SAML-D CSV on the mounted Google Drive; read by the next cell.
CSV_PATH = "/content/drive/MyDrive/Research Folder (Al-Amin Dhaly)/SAML-D.csv"
print("Using CSV:", CSV_PATH)

Using CSV: /content/drive/MyDrive/Research Folder (Al-Amin Dhaly)/SAML-D.csv


In [None]:
# Load the whole transactions table into memory. Later cells read the columns
# Date, Time, Sender_account, Receiver_account, Amount, Is_laundering and the
# categorical fields listed in `cat_cols`.
df = pd.read_csv(CSV_PATH)

In [None]:
# Combine Date and Time into a single datetime column and put rows in time order.
# After reset_index the row position of a transaction equals its index label,
# which is what the positional arrays built below rely on.
df["Datetime"] = pd.to_datetime(df["Date"] + " " + df["Time"], dayfirst=True)
df = df.sort_values("Datetime").reset_index(drop=True)

# Convert datetime to numeric seconds since the first transaction (time-based feature)
df["Timestamp"] = (df["Datetime"] - df["Datetime"].min()).dt.total_seconds()

# Map account IDs to integer node IDs for the graph; senders and receivers share
# one ID space so an account gets the same node whichever role it plays.
accounts = pd.Index(df["Sender_account"]).append(pd.Index(df["Receiver_account"])).unique()
acc_to_idx = {acc: i for i, acc in enumerate(accounts)}
df["sender_idx"] = df["Sender_account"].map(acc_to_idx)
df["receiver_idx"] = df["Receiver_account"].map(acc_to_idx)

# Binary label: 1 = laundering, 0 = normal
df["label"] = df["Is_laundering"].astype(np.float32)

# Choose feature columns
cat_cols = [
    "Payment_currency",
    "Received_currency",
    "Sender_bank_location",
    "Receiver_bank_location",
    "Payment_type",
]
num_cols = ["Amount", "Timestamp"]

print("Encoding transaction features...")
# One-hot encode categorical features with an explicit float dtype. Without it,
# recent pandas versions emit bool columns, and converting the mixed bool/float
# frame to NumPy produces a huge object array that then has to be converted again.
df_cat = pd.get_dummies(df[cat_cols], drop_first=False, dtype=np.float32)
txn_features_df = pd.concat([df[num_cols], df_cat], axis=1)

# Scale transaction features (statistics computed in float64, stored as float32).
# NOTE(review): the scaler is fit on all rows, i.e. before the train/test split
# made in a later cell, so test-set statistics influence the training features.
scaler_txn = StandardScaler()
txn_features = scaler_txn.fit_transform(
    txn_features_df.to_numpy(dtype=np.float64)
).astype(np.float32)
feature_dim = txn_features.shape[1]

# Plain NumPy views of the per-transaction columns used by the graph cell.
labels = df["label"].values.astype(np.float32)
sender_idx_arr = df["sender_idx"].values.astype(np.int64)
receiver_idx_arr = df["receiver_idx"].values.astype(np.int64)

print("Transaction feature dimension:", feature_dim)
print("Total transactions:", len(df))

Encoding transaction features...
Transaction feature dimension: 71
Total transactions: 1048575


In [None]:
# Graph branch input: one node per account, one directed edge per transaction.

num_nodes = len(accounts)

# Per-account transaction count and total "Amount", separately for the
# sending role and the receiving role.
out_stats = df.groupby("sender_idx")["Amount"].agg(["count", "sum"])
in_stats = df.groupby("receiver_idx")["Amount"].agg(["count", "sum"])

# Node feature columns: [out count, out sum, in count, in sum].
# Accounts that never appear in a role keep zeros in that role's two columns.
node_feats = np.zeros((num_nodes, 4), dtype=np.float32)
for first_col, stats in ((0, out_stats), (2, in_stats)):
    node_feats[stats.index, first_col] = stats["count"].values
    node_feats[stats.index, first_col + 1] = stats["sum"].values

# Standardise every feature column
scaler_node = StandardScaler()
node_feats = scaler_node.fit_transform(node_feats).astype(np.float32)

# Edge list: row 0 holds sender node ids, row 1 the matching receiver node ids
edge_index = torch.tensor(
    np.stack([sender_idx_arr, receiver_idx_arr], axis=0),
    dtype=torch.long,
)

node_x = torch.tensor(node_feats, dtype=torch.float32)

graph_data = GeoData(x=node_x, edge_index=edge_index)

print("Number of nodes:", node_x.shape[0])
print("Number of edges:", edge_index.shape[1])

# Place the graph tensors on DEVICE once; despite the "_gpu" suffix these live
# on the CPU when no GPU is available.
node_x_gpu = graph_data.x.to(DEVICE)
edge_index_gpu = graph_data.edge_index.to(DEVICE)

Number of nodes: 325888
Number of edges: 1048575


In [None]:
# Build sequence dataset for temporal branch

class AMLSequenceDataset(Dataset):
    """
    Sliding-window sequence samples built per sender account.

    Each sample:
      - the features of ``seq_len`` consecutive transactions of one sender, in
        chronological order (the last one is the transaction being classified)
      - sender index, receiver index of that last transaction
      - label of that last transaction

    Args:
        df: transaction table with "Datetime", "sender_idx", "receiver_idx" and
            "label" columns. Rows may be in any order.
        txn_features: 2-D array of per-transaction features; row ``i`` must
            belong to the ``i``-th row (by position) of ``df``.
        seq_len: number of transactions per window; must be >= 1.
        max_samples_per_account: cap on the windows kept per sender; senders
            with more windows are subsampled at random without replacement.
        seed: seed of the subsampling RNG. ``None`` falls back to the
            notebook-level ``RANDOM_STATE``.

    Raises:
        ValueError: if ``seq_len`` < 1 or ``txn_features`` does not have exactly
            one row per row of ``df``.
    """

    def __init__(self, df, txn_features, seq_len=10, max_samples_per_account=200, seed=None):
        if seq_len < 1:
            raise ValueError("seq_len must be >= 1")
        if len(txn_features) != len(df):
            raise ValueError("txn_features must have exactly one row per row of df")

        self.seq_len = seq_len
        self.txn_features = txn_features
        self.labels = df["label"].values.astype(np.float32)
        self.sender_idx = df["sender_idx"].values.astype(np.int64)
        self.receiver_idx = df["receiver_idx"].values.astype(np.int64)

        # List of (window_row_positions, target_row_position) pairs.
        self.samples = []

        rng = np.random.default_rng(RANDOM_STATE if seed is None else seed)

        # Row positions of df in chronological order. All positions refer to df
        # itself (not to a sorted copy), because txn_features / labels are
        # aligned with df's own row order. Stable sorts keep ties deterministic.
        time_order = np.argsort(df["Datetime"].values, kind="stable")
        # A second stable sort by sender groups the rows per account while
        # preserving the chronological order inside every group.
        by_sender = time_order[np.argsort(self.sender_idx[time_order], kind="stable")]
        group_starts = np.flatnonzero(np.diff(self.sender_idx[by_sender])) + 1

        # Senders are visited in ascending sender_idx order.
        for idxs in np.split(by_sender, group_starts):
            n_windows = len(idxs) - seq_len + 1
            if n_windows < 1:
                continue  # not enough history for even one window

            if n_windows > max_samples_per_account:
                # Limit huge accounts to a max number of samples
                starts = rng.choice(n_windows, size=max_samples_per_account, replace=False)
            else:
                starts = range(n_windows)

            self.samples.extend(
                (idxs[start : start + seq_len], idxs[start + seq_len - 1])
                for start in starts
            )

        print(f"Total sequence samples: {len(self.samples)}")

    def __len__(self):
        """Number of sequence samples."""
        return len(self.samples)

    def __getitem__(self, idx):
        """
        Return ``(seq_x, sender, receiver, label)`` tensors for sample ``idx``.

        ``seq_x`` is float32 with one row per transaction of the window; the
        other three describe the window's last transaction.
        """
        window, target_idx = self.samples[idx]
        # Sequence of features
        seq_x = self.txn_features[window]  # (seq_len, feature_dim)
        # Last transaction info
        y = self.labels[target_idx]
        s = self.sender_idx[target_idx]
        r = self.receiver_idx[target_idx]

        return (
            torch.tensor(seq_x, dtype=torch.float32),
            torch.tensor(s, dtype=torch.long),
            torch.tensor(r, dtype=torch.long),
            torch.tensor(y, dtype=torch.float32),
        )

print("Creating temporal sequence dataset...")
full_dataset = AMLSequenceDataset(df, txn_features, seq_len=SEQ_LEN)

# Hold out a TEST_SIZE fraction of the sequence samples, picked at random.
# NOTE(review): windows of one sender are built with stride 1 and therefore
# overlap, so a random split can put near-identical sequences on both sides.
n_samples = len(full_dataset)
train_size = int((1 - TEST_SIZE) * n_samples)
test_size = n_samples - train_size

split_generator = torch.Generator().manual_seed(RANDOM_STATE)
train_dataset, test_dataset = random_split(
    full_dataset,
    [train_size, test_size],
    generator=split_generator,
)

# Training batches are reshuffled each epoch and the incomplete final batch is
# dropped; evaluation keeps a fixed order and every sample.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, drop_last=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

print("Train samples:", len(train_dataset))
print("Test samples :", len(test_dataset))

Creating temporal sequence dataset...
Total sequence samples: 619077
Train samples: 495261
Test samples : 123816


In [None]:
#Simple blockchain simulation for AML alerts

class SimpleBlockchainAML:
    """
    Very small blockchain-style log for high-risk alerts.

    Every block is a dict with the keys "index", "prev_hash", "data" and "hash".
    "hash" is the SHA-256 of the other three fields (JSON-encoded with sorted
    keys), and "prev_hash" repeats the previous block's "hash", so each block
    commits to everything before it.
    """

    def __init__(self):
        self.chain = []
        self._create_genesis_block()

    def _create_genesis_block(self):
        """Append block 0, whose predecessor hash is 64 zero characters."""
        block = {
            "index": 0,
            "prev_hash": "0" * 64,
            "data": "Genesis Block",
        }
        block["hash"] = self._hash_block(block)
        self.chain.append(block)

    def _hash_block(self, block_data):
        """Return the hex SHA-256 of ``block_data`` (must be JSON-serialisable)."""
        # sort_keys makes the fingerprint independent of dict insertion order
        block_str = json.dumps(block_data, sort_keys=True).encode("utf-8")
        return hashlib.sha256(block_str).hexdigest()

    def add_alert(self, tx_info, risk_score):
        """
        Append one alert block linked to the current chain tip.

        Args:
            tx_info: JSON-serialisable description of the flagged transaction.
            risk_score: numeric score; stored as a Python float.
        """
        prev_block = self.chain[-1]
        block = {
            "index": len(self.chain),
            "prev_hash": prev_block["hash"],
            "data": {
                "tx_info": tx_info,
                "risk_score": float(risk_score),
            },
        }
        # The hash is computed before the "hash" key itself is added.
        block["hash"] = self._hash_block(block)
        self.chain.append(block)

    def is_valid(self):
        """
        Check the integrity of the whole chain.

        Returns:
            True when every block has the expected index, points at its
            predecessor's hash and still matches its own stored hash.
        """
        expected_prev = "0" * 64
        for position, block in enumerate(self.chain):
            payload = {key: value for key, value in block.items() if key != "hash"}
            if (
                block.get("index") != position
                or block.get("prev_hash") != expected_prev
                or block.get("hash") != self._hash_block(payload)
            ):
                return False
            expected_prev = block["hash"]
        return True

    def __len__(self):
        """Number of blocks, including the genesis block."""
        return len(self.chain)

    def tail(self, n=5):
        """Return the last ``n`` blocks (an empty list when ``n`` <= 0)."""
        # chain[-0:] would be the entire chain, so non-positive n is handled here
        if n <= 0:
            return []
        return self.chain[-n:]


# Single notebook-level ledger; the training loop hands it to evaluate() so that
# high-risk predictions are recorded as alert blocks.
blockchain = SimpleBlockchainAML()
print("Blockchain initialized. Length:", len(blockchain))

Blockchain initialized. Length: 1


In [None]:
# Define the hybrid Transformer + GAT model

class PositionalEncoding(nn.Module):
    """
    Adds fixed sinusoidal position information to sequence embeddings.

    Even feature columns hold sines and odd columns hold cosines of the position
    scaled by a per-column frequency. The table is stored as the buffer ``pe``.

    Args:
        d_model: width of the embeddings the encoding is added to (even or odd).
        max_len: number of positions precomputed; inputs must not be longer.
    """

    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float32).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-np.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        # With an odd d_model there is one cosine column fewer than sine columns,
        # so only the first d_model // 2 frequencies are used here.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
        pe = pe.unsqueeze(0)  # shape: (1, max_len, d_model)
        self.register_buffer("pe", pe)

    def forward(self, x):
        """Return ``x`` plus the encoding of its first ``x.size(1)`` positions."""
        seq_len = x.size(1)
        return x + self.pe[:, :seq_len]


class HybridAMLModel(nn.Module):
    """
    Two-branch classifier for the last transaction of a sequence.

    Temporal branch: linear projection, positional encoding and a Transformer
    encoder over the transaction sequence; the last time step is the summary.
    Graph branch: one GAT layer over the account graph; the sender's and the
    receiver's node embeddings are looked up for every sample.
    The three vectors are concatenated and fed to a small MLP that returns one
    raw logit per sample (no sigmoid is applied inside the model).
    """

    def __init__(
        self,
        txn_input_dim,
        node_input_dim,
        d_model=64,
        nhead=4,
        num_layers=1,      # a single encoder layer by default
        gat_hidden=32,     # width of the node embeddings
        fusion_hidden=64,
    ):
        super().__init__()

        # --- temporal branch ---
        # Lift raw transaction features to the Transformer width.
        self.input_proj = nn.Linear(txn_input_dim, d_model)
        self.transformer_encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead, batch_first=True),
            num_layers=num_layers,
        )
        self.pos_encoder = PositionalEncoding(d_model)

        # --- graph branch ---
        # concat=False averages the two attention heads, so the output width
        # stays gat_hidden.
        self.gat1 = GATConv(node_input_dim, gat_hidden, heads=2, concat=False)

        # --- fusion head ---
        # Input is [temporal | sender embedding | receiver embedding].
        fused_dim = d_model + 2 * gat_hidden
        self.fusion = nn.Sequential(
            nn.Linear(fused_dim, fusion_hidden),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(fusion_hidden, 1),
        )

    def forward(self, seq_x, sender_idx, receiver_idx, node_x, edge_index):
        """
        Score a batch of sequences.

        Args:
            seq_x: batch-first sequence tensor (batch, seq_len, txn_input_dim).
            sender_idx: node index of each sample's sender.
            receiver_idx: node index of each sample's receiver.
            node_x: node feature matrix of the whole graph.
            edge_index: edge list of the whole graph.

        Returns:
            1-D tensor with one raw logit per sample.
        """
        # Temporal branch: keep only the encoder output at the final position.
        tokens = self.pos_encoder(self.input_proj(seq_x))
        encoded = self.transformer_encoder(tokens)
        last_step = encoded[:, -1, :]

        # Graph branch: embeddings for all nodes, then a per-sample lookup.
        node_emb = F.elu(self.gat1(node_x, edge_index))

        fused = torch.cat(
            [last_step, node_emb[sender_idx], node_emb[receiver_idx]],
            dim=1,
        )
        return self.fusion(fused).squeeze(1)


# Architecture settings collected in one mapping; the two input widths are
# taken from the data prepared in earlier cells.
model_kwargs = dict(
    txn_input_dim=feature_dim,
    node_input_dim=node_x.shape[1],
    d_model=64,
    nhead=4,
    num_layers=1,
    gat_hidden=32,
    fusion_hidden=64,
)
model = HybridAMLModel(**model_kwargs).to(DEVICE)

# The model emits raw logits, so the loss applies the sigmoid itself.
# NOTE(review): no pos_weight is passed, so both classes are weighted equally.
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LR)

print(model)

HybridAMLModel(
  (input_proj): Linear(in_features=71, out_features=64, bias=True)
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=64, out_features=64, bias=True)
        )
        (linear1): Linear(in_features=64, out_features=2048, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=2048, out_features=64, bias=True)
        (norm1): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (pos_encoder): PositionalEncoding()
  (gat1): GATConv(4, 32, heads=2)
  (fusion): Sequential(
    (0): Linear(in_features=128, out_features=64, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.3, inplace=False

In [None]:
#Training and evaluation functions
def train_one_epoch(epoch):
    """
    Run one optimisation pass over ``train_loader`` and print the mean loss.

    Works on the notebook-level ``model``, ``optimizer``, ``criterion``,
    ``train_loader``, ``DEVICE`` and the graph tensors ``node_x_gpu`` /
    ``edge_index_gpu``.

    Args:
        epoch: epoch number; only used in the printed message.

    Returns:
        Mean loss per processed sample (NaN when the loader yields no batch).
    """
    model.train()
    total_loss = 0.0
    n_seen = 0

    for seq_x, s_idx, r_idx, y in train_loader:
        seq_x = seq_x.to(DEVICE)
        s_idx = s_idx.to(DEVICE)
        r_idx = r_idx.to(DEVICE)
        y = y.to(DEVICE)

        optimizer.zero_grad()
        logits = model(seq_x, s_idx, r_idx, node_x_gpu, edge_index_gpu)
        loss = criterion(logits, y)
        loss.backward()
        optimizer.step()

        # criterion returns the batch mean; weight it by the batch size so the
        # epoch figure is a per-sample mean.
        batch_size = seq_x.size(0)
        total_loss += loss.item() * batch_size
        n_seen += batch_size

    # Divide by the samples actually processed: train_loader drops the last
    # incomplete batch, so len(train_loader.dataset) would over-count.
    avg_loss = total_loss / n_seen if n_seen else float("nan")
    print(f"Epoch {epoch:02d} - Train Loss: {avg_loss:.4f}")
    return avg_loss


def evaluate(blockchain_obj=None, alert_threshold=0.9, pred_threshold=0.5):
    """
    Score ``test_loader`` with the notebook-level ``model`` and print metrics.

    Args:
        blockchain_obj: optional ledger exposing ``add_alert(tx_info, risk_score)``;
            when given, every prediction with probability >= ``alert_threshold``
            is recorded on it.
        alert_threshold: minimum probability for a prediction to be logged.
        pred_threshold: probability at or above which a sample is classified as
            positive for accuracy / precision / recall / F1.

    Returns:
        Tuple ``(accuracy, precision, recall, f1, auc)``; ``auc`` is NaN when
        ``roc_auc_score`` raises ValueError.
    """
    model.eval()
    all_labels = []
    all_probs = []

    with torch.no_grad():
        for seq_x, s_idx, r_idx, y in test_loader:
            logits = model(
                seq_x.to(DEVICE),
                s_idx.to(DEVICE),
                r_idx.to(DEVICE),
                node_x_gpu,
                edge_index_gpu,
            )
            probs_np = torch.sigmoid(logits).cpu().numpy()
            # Labels and node ids are only needed as NumPy arrays, so they are
            # never copied to DEVICE.
            y_np = y.cpu().numpy()

            # log high-risk predictions to blockchain
            if blockchain_obj is not None:
                s_np = s_idx.cpu().numpy()
                r_np = r_idx.cpu().numpy()
                # Positions of the alerts, in batch order.
                for i in np.flatnonzero(probs_np >= alert_threshold):
                    tx_info = {
                        "sender_node": int(s_np[i]),
                        "receiver_node": int(r_np[i]),
                        "true_label": int(y_np[i]),
                    }
                    blockchain_obj.add_alert(tx_info, risk_score=probs_np[i])

            all_labels.append(y_np)
            all_probs.append(probs_np)

    y_true = np.concatenate(all_labels)
    y_prob = np.concatenate(all_probs)
    y_pred = (y_prob >= pred_threshold).astype(np.float32)

    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, zero_division=0)
    rec = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)

    try:
        auc = roc_auc_score(y_true, y_prob)
    except ValueError:
        # Raised e.g. when y_true contains a single class.
        auc = float("nan")

    print(f"Test Accuracy : {acc:.4f}")
    print(f"Test Precision: {prec:.4f}")
    print(f"Test Recall   : {rec:.4f}")
    print(f"Test F1       : {f1:.4f}")
    print(f"Test AUC      : {auc:.4f}")

    return acc, prec, rec, f1, auc

In [None]:
print("Starting training...")

for epoch in range(1, EPOCHS + 1):
    print("\n" + "-" * 60)
    # Show which epoch and overall percentage
    pct_done = 100.0 * epoch / EPOCHS
    print(f"Epoch {epoch}/{EPOCHS}  ({pct_done:.1f}% completed)")

    # Train for one epoch
    train_one_epoch(epoch)

    # Evaluate after every epoch, but write alerts to the ledger only for the
    # final model. Logging on every epoch would append the same test
    # transactions once per epoch, scored by partially trained weights.
    print("Evaluating on test set...")
    is_last_epoch = epoch == EPOCHS
    acc, prec, rec, f1, auc = evaluate(
        blockchain_obj=blockchain if is_last_epoch else None,
        alert_threshold=0.9
    )

    #Short summary line for that epoch
    print(
        f"Epoch {epoch:02d} - "
        f"Acc: {acc:.4f}, Prec: {prec:.4f}, "
        f"Rec: {rec:.4f}, F1: {f1:.4f}, AUC: {auc:.4f}"
    )

print("\nTraining finished.")
print("Blockchain length (blocks):", len(blockchain))
print("Last few blockchain blocks:")
for block in blockchain.tail(5):
    print(block)

Starting training...

------------------------------------------------------------
Epoch 1/5  (20.0% completed)
Epoch 01 - Train Loss: 0.0060
Evaluating on test set...
Test Accuracy : 0.9996
Test Precision: 0.0000
Test Recall   : 0.0000
Test F1       : 0.0000
Test AUC      : 0.8834
Epoch 01 - Acc: 0.9996, Prec: 0.0000, Rec: 0.0000, F1: 0.0000, AUC: 0.8834

------------------------------------------------------------
Epoch 2/5  (40.0% completed)
Epoch 02 - Train Loss: 0.0036
Evaluating on test set...
Test Accuracy : 0.9996
Test Precision: 0.0000
Test Recall   : 0.0000
Test F1       : 0.0000
Test AUC      : 0.8903
Epoch 02 - Acc: 0.9996, Prec: 0.0000, Rec: 0.0000, F1: 0.0000, AUC: 0.8903

------------------------------------------------------------
Epoch 3/5  (60.0% completed)
Epoch 03 - Train Loss: 0.0031
Evaluating on test set...
Test Accuracy : 0.9997
Test Precision: 1.0000
Test Recall   : 0.1702
Test F1       : 0.2909
Test AUC      : 0.8895
Epoch 03 - Acc: 0.9997, Prec: 1.0000, Rec: 

In [None]:
# Save trained model to Drive.
# Only the state_dict (parameters and buffers) is written, so reloading needs a
# HybridAMLModel built with the same constructor arguments.
save_path = "/content/drive/MyDrive/Research Folder (Al-Amin Dhaly)/hybrid_aml_saml_d.pt"
torch.save(model.state_dict(), save_path)
print("Model saved to:", save_path)

Model saved to: /content/drive/MyDrive/Research Folder (Al-Amin Dhaly)/hybrid_aml_saml_d.pt


In [None]:
"""https://pytorch.org/tutorials/beginner/nlp/sequence_models_tutorial.html"""