In [10]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc

#### Ensure all input features have been scaled before fitting the models below.

Synthetic data used to check that the code runs correctly.

In [25]:
from sklearn.datasets import make_classification
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import numpy as np


# Build a synthetic classification problem (1000 samples, 20 features) with a
# fixed seed so every run produces the same data.
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=15,
    n_redundant=5,
    random_state=42,
)

# Hold out 20% of the samples for evaluation; the split is seeded as well.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the min-max scaler on the training split only, then apply that same
# mapping to the test split.
scaler = MinMaxScaler(feature_range=(0, 1))
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

### Logistic Regression

In [26]:
# Inputs are expected to be scaled before they reach this function.
def lr_train(X_train, y_train):
    """Fit a default-configured ``LogisticRegression`` and return it.

    Args:
        X_train: Training feature matrix, passed straight to ``fit``.
        y_train: Training labels, passed straight to ``fit``.

    Returns:
        The fitted ``LogisticRegression`` estimator.
    """
    classifier = LogisticRegression()
    # Estimator ``fit`` hands back the estimator itself, so this returns it.
    return classifier.fit(X_train, y_train)

def test_model(model, X_test, y_test):
    y_predicted = model.predict(X_test)

    accuracy_result = accuracy_score(y_test, y_predicted)
    confusion_matrix_result = confusion_matrix(y_test, y_predicted)
    classification_report_result = classification_report(y_test, y_predicted)

    return accuracy_result, confusion_matrix_result, classification_report_result

In [27]:
# Fit logistic regression on the scaled training split. The trailing call is
# the cell's displayed value: (accuracy, confusion matrix, report text).
model_LR = lr_train(X_train, y_train)
test_model(model_LR, X_test, y_test)

(0.835,
 array([[88, 18],
        [15, 79]]),
 '              precision    recall  f1-score   support\n\n           0       0.85      0.83      0.84       106\n           1       0.81      0.84      0.83        94\n\n    accuracy                           0.83       200\n   macro avg       0.83      0.84      0.83       200\nweighted avg       0.84      0.83      0.84       200\n')

### Naive Bayes (MultinomialNB)

In [28]:
# Inputs are expected to be scaled before they reach this function.
def mnb_train(X_train, y_train):
    """Fit a default-configured ``MultinomialNB`` and return it.

    Args:
        X_train: Training feature matrix, passed straight to ``fit``.
        y_train: Training labels, passed straight to ``fit``.

    Returns:
        The fitted ``MultinomialNB`` estimator.
    """
    classifier = MultinomialNB()
    # Estimator ``fit`` hands back the estimator itself, so this returns it.
    return classifier.fit(X_train, y_train)

In [30]:
# Fit multinomial naive Bayes on the scaled training split. The trailing call
# is the cell's displayed value: (accuracy, confusion matrix, report text).
model_MNB = mnb_train(X_train, y_train)
test_model(model_MNB, X_test, y_test)

(0.74,
 array([[75, 31],
        [21, 73]]),
 '              precision    recall  f1-score   support\n\n           0       0.78      0.71      0.74       106\n           1       0.70      0.78      0.74        94\n\n    accuracy                           0.74       200\n   macro avg       0.74      0.74      0.74       200\nweighted avg       0.74      0.74      0.74       200\n')

### Artificial Neural Network

### Graph ML

In [5]:
import networkx as nx
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv, SAGEConv, global_mean_pool
from typing import List, Dict, Tuple
import numpy as np
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

class GraphFeatureExtractor:
    """Turn node and edge attributes of a graph into fixed-length vectors.

    Node vectors have 5 entries: a one-hot of the node ``type`` over
    ``NODE_TYPES`` followed by a has-country flag.  Edge vectors have 6
    entries: a one-hot of the edge ``type`` over ``EDGE_TYPES`` followed by
    amount, hour, minute and entity_age (all zero for non-transaction edges).
    """

    # One-hot slot order for the ``type`` attribute of nodes and of edges.
    NODE_TYPES = ('Entity', 'Account', 'Standalone', 'Cash Node')
    EDGE_TYPES = ('Account Ownership', 'Transaction')
    # Attribute names a transaction time may be stored under, in lookup
    # order.  'txn time' is the original key; 'txn_time' is the spelling that
    # can be passed as a keyword argument to ``Graph.add_edge``.
    TXN_TIME_KEYS = ('txn time', 'txn_time')

    def __init__(self, G: "nx.Graph"):
        """Keep a reference to the graph whose attributes will be read."""
        self.G = G

    def extract_node_features(self, node_id: str) -> np.ndarray:
        """Extract features for a given node.

        Args:
            node_id: Key of the node in ``self.G.nodes``.

        Returns:
            Array of 5 values: 4 one-hot node-type slots, then 1 if the node
            is an Entity/Standalone with a truthy ``country``, else 0.
        """
        node = self.G.nodes[node_id]
        node_type = node.get('type', '')

        # Unknown or missing types leave every one-hot slot at 0.
        features = [1 if node_type == known else 0 for known in self.NODE_TYPES]

        # Only Entity and Standalone nodes contribute a country flag.
        if node_type in ('Entity', 'Standalone'):
            features.append(1 if node.get('country') else 0)
        else:
            features.append(0)

        return np.array(features)

    def extract_edge_features(self, edge: Tuple) -> np.ndarray:
        """Extract features for a given edge.

        Args:
            edge: Key of the edge in ``self.G.edges`` (an endpoint tuple).

        Returns:
            Array of 6 values: 2 one-hot edge-type slots, then amount, hour,
            minute and entity_age for Transaction edges (zeros otherwise).
        """
        edge_data = self.G.edges[edge]
        edge_type = edge_data.get('type', '')

        features = [1 if edge_type == known else 0 for known in self.EDGE_TYPES]

        if edge_type == 'Transaction':
            # Use the first time attribute that is present; default to 0.
            txn_time = 0
            for key in self.TXN_TIME_KEYS:
                if key in edge_data:
                    txn_time = edge_data[key]
                    break
            features.extend([
                edge_data.get('amount', 0),
                txn_time // 100,  # hour, with the time encoded as HHMM
                txn_time % 100,   # minute
                edge_data.get('entity_age', 0),
            ])
        else:
            features.extend([0, 0, 0, 0])

        return np.array(features)

class GraphRandomForest:
    """Random-forest classifier over hand-built features of a graph.

    Depending on ``target_type`` it classifies Transaction edges or
    Entity/Standalone nodes, using the ``suspicious`` attribute as the label.
    """

    def __init__(self, n_estimators=100, random_state=None):
        """Create the underlying forest.

        Args:
            n_estimators: Number of trees, forwarded to the classifier.
            random_state: Optional seed forwarded to the classifier so that
                training can be made reproducible.  ``None`` keeps the
                classifier's default (unseeded) behaviour.
        """
        self.rf = RandomForestClassifier(
            n_estimators=n_estimators, random_state=random_state
        )
        self.feature_extractor = None

    def prepare_features(self, G: nx.Graph, target_type: str) -> Tuple[np.ndarray, np.ndarray]:
        """Prepare features for either transaction or entity classification.

        Args:
            G: Graph whose nodes/edges carry ``type`` and ``suspicious``.
            target_type: ``'transaction'`` for one row per Transaction edge
                (both endpoint node vectors plus the edge vector), or
                ``'entity'`` for one row per Entity/Standalone node (node
                vector plus the mean of its Transaction edge vectors).

        Returns:
            ``(features, labels)`` arrays with one entry per selected
            edge/node, in graph iteration order.

        Raises:
            ValueError: If ``target_type`` is not one of the two known values.
        """
        # Fail early instead of returning empty arrays that only blow up
        # later inside the classifier with an unrelated-looking message.
        if target_type not in ('transaction', 'entity'):
            raise ValueError(
                f"target_type must be 'transaction' or 'entity', got {target_type!r}"
            )

        # This method is also borrowed by GraphGradientBoosting, so it must
        # only rely on ``self.feature_extractor`` and nothing forest-specific.
        self.feature_extractor = GraphFeatureExtractor(G)
        extractor = self.feature_extractor
        features = []
        labels = []

        if target_type == 'transaction':
            for edge in G.edges():
                edge_data = G.edges[edge]
                if edge_data.get('type') != 'Transaction':
                    continue
                # Row layout: source-node vector, target-node vector, edge vector.
                features.append(np.concatenate([
                    extractor.extract_node_features(edge[0]),
                    extractor.extract_node_features(edge[1]),
                    extractor.extract_edge_features(edge),
                ]))
                labels.append(edge_data.get('suspicious', 0))
        else:  # entity
            for node in G.nodes():
                node_data = G.nodes[node]
                if node_data.get('type') not in ('Entity', 'Standalone'):
                    continue

                # Average the vectors of all Transaction edges touching the node.
                txn_features = [
                    extractor.extract_edge_features(edge)
                    for edge in G.edges(node)
                    if G.edges[edge].get('type') == 'Transaction'
                ]
                if txn_features:
                    txn_agg = np.mean(txn_features, axis=0)
                else:
                    txn_agg = np.zeros(6)  # Default edge feature size

                features.append(np.concatenate([
                    extractor.extract_node_features(node),
                    txn_agg,
                ]))
                labels.append(node_data.get('suspicious', 0))

        return np.array(features), np.array(labels)

    def fit(self, G: nx.Graph, target_type: str):
        """Build features/labels from ``G`` and train the forest on them."""
        X, y = self.prepare_features(G, target_type)
        self.rf.fit(X, y)

    def predict(self, G: nx.Graph, target_type: str) -> np.ndarray:
        """Predict one label per row produced by ``prepare_features``."""
        X, _ = self.prepare_features(G, target_type)
        return self.rf.predict(X)

class GraphGradientBoosting:
    """Gradient-boosting classifier over the same features as GraphRandomForest."""

    def __init__(self, n_estimators=100, random_state=None):
        """Create the underlying gradient-boosting classifier.

        Args:
            n_estimators: Number of boosting stages, forwarded to the classifier.
            random_state: Optional seed forwarded to the classifier so that
                training can be made reproducible.  ``None`` keeps the
                classifier's default (unseeded) behaviour.
        """
        self.gb = GradientBoostingClassifier(
            n_estimators=n_estimators, random_state=random_state
        )
        # Populated by prepare_features on each call.
        self.feature_extractor = None

    # Reuse the same feature preparation method as GraphRandomForest; as a
    # plain function in the class body it binds to instances of this class.
    prepare_features = GraphRandomForest.prepare_features

    def fit(self, G: nx.Graph, target_type: str):
        """Build features/labels from ``G`` and train the classifier on them."""
        X, y = self.prepare_features(G, target_type)
        self.gb.fit(X, y)

    def predict(self, G: nx.Graph, target_type: str) -> np.ndarray:
        """Predict one label per row produced by ``prepare_features``."""
        X, _ = self.prepare_features(G, target_type)
        return self.gb.predict(X)

class GraphNeuralNetwork(nn.Module):
    """Three-layer graph network with a node head and an edge head.

    Node embeddings come from two ``GCNConv`` layers followed by one
    ``SAGEConv`` layer, each followed by ReLU.  Dropout is applied after the
    first two layers while the module is in training mode.
    """

    def __init__(self, node_features, edge_features, hidden_channels, dropout=0.2):
        """Build the convolution stack and both classification heads.

        Args:
            node_features: Width of each input node vector.
            edge_features: Width of each edge attribute vector.
            hidden_channels: Width of the node embeddings.
            dropout: Dropout probability used after the first two layers.
        """
        super().__init__()
        self.conv1 = GCNConv(node_features, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.conv3 = SAGEConv(hidden_channels, hidden_channels)

        self.node_classifier = nn.Linear(hidden_channels, 1)
        # The edge head sees both endpoint embeddings plus the edge attributes.
        self.edge_classifier = nn.Linear(hidden_channels * 2 + edge_features, 1)
        self.dropout_p = dropout

    def forward(self, x, edge_index, edge_attr=None, predict_edges=False):
        """Return sigmoid scores for nodes or, if requested, for edges.

        Args:
            x: Node feature matrix with ``node_features`` columns.
            edge_index: Tensor whose two rows are source and target node indices.
            edge_attr: Edge attribute matrix with one row per column of
                ``edge_index``; required when ``predict_edges`` is True.
            predict_edges: Score edges instead of nodes.

        Returns:
            Tensor with one sigmoid output per node, or per edge when
            ``predict_edges`` is True.

        Raises:
            ValueError: If ``predict_edges`` is True but ``edge_attr`` is None.
        """
        # Checked up front so the caller gets a clear message rather than a
        # type error from the concatenation further down.
        if predict_edges and edge_attr is None:
            raise ValueError("edge_attr is required when predict_edges=True")

        hidden = F.relu(self.conv1(x, edge_index))
        hidden = F.dropout(hidden, p=self.dropout_p, training=self.training)

        hidden = F.relu(self.conv2(hidden, edge_index))
        hidden = F.dropout(hidden, p=self.dropout_p, training=self.training)

        node_embeddings = F.relu(self.conv3(hidden, edge_index))

        if not predict_edges:
            return torch.sigmoid(self.node_classifier(node_embeddings))

        # For edge prediction, combine both endpoint embeddings with the
        # edge's own attributes.
        src, dst = edge_index
        edge_inputs = torch.cat(
            [node_embeddings[src], node_embeddings[dst], edge_attr],
            dim=1,
        )
        return torch.sigmoid(self.edge_classifier(edge_inputs))

class GraphGNNWrapper:
    """Train and apply a GraphNeuralNetwork directly on a NetworkX graph.

    Every graph edge is expanded into two directed edges (forward, then
    reverse), so edge-level outputs have two rows per graph edge.  Training
    uses every edge/node row; rows that are not Transaction edges (or not
    Entity/Standalone nodes) are trained against label 0.
    """

    def __init__(self, node_features, edge_features, hidden_channels=64):
        """Build the network.

        Args:
            node_features: Width of the node vectors fed to the network.
            edge_features: Width of the edge attribute vectors.
            hidden_channels: Width of the node embeddings.
        """
        self.model = GraphNeuralNetwork(node_features, edge_features, hidden_channels)
        self.feature_extractor = None
        # Remembered so graphs without nodes/edges still get 2-D tensors.
        self.node_feature_dim = node_features
        self.edge_feature_dim = edge_features

    @staticmethod
    def _as_float_matrix(rows, width):
        """Stack equal-length vectors into a float tensor of shape (len(rows), width)."""
        if not rows:
            return torch.zeros((0, width), dtype=torch.float)
        # Going through one ndarray avoids the slow list-of-arrays conversion.
        return torch.tensor(np.array(rows), dtype=torch.float)

    @staticmethod
    def _training_labels(G, target_type):
        """Build the float label vector aligned with the model's output rows."""
        if target_type == 'transaction':
            labels = []
            for _, _, attrs in G.edges(data=True):
                is_txn = attrs.get('type') == 'Transaction'
                label = attrs.get('suspicious', 0) if is_txn else 0
                labels.extend([label, label])  # forward and reverse direction
        else:  # entity
            labels = [
                attrs.get('suspicious', 0)
                if attrs.get('type') in ('Entity', 'Standalone') else 0
                for _, attrs in G.nodes(data=True)
            ]
        return torch.tensor(labels, dtype=torch.float)

    def prepare_pyg_data(self, G: nx.Graph) -> Data:
        """Convert NetworkX graph to PyTorch Geometric Data.

        Nodes are indexed in ``G.nodes()`` order.  Each edge of ``G.edges()``
        contributes two consecutive columns to ``edge_index`` (forward, then
        reverse) that share the same attribute vector.
        """
        self.feature_extractor = GraphFeatureExtractor(G)
        extractor = self.feature_extractor

        node_mapping = {node: idx for idx, node in enumerate(G.nodes())}
        node_features = [extractor.extract_node_features(node) for node in G.nodes()]

        edge_index = []
        edge_features = []
        for edge in G.edges():
            src, dst = node_mapping[edge[0]], node_mapping[edge[1]]
            # Add edge in both directions for undirected graphs
            edge_index.extend([[src, dst], [dst, src]])
            edge_feat = extractor.extract_edge_features(edge)
            edge_features.extend([edge_feat, edge_feat])

        # The explicit (rows, 2) shape keeps an edge-less graph at (2, 0)
        # after the transpose instead of collapsing to a 1-D tensor.
        edge_index_tensor = (
            torch.tensor(edge_index, dtype=torch.long)
            .reshape(len(edge_index), 2)
            .t()
            .contiguous()
        )

        return Data(
            x=self._as_float_matrix(node_features, self.node_feature_dim),
            edge_index=edge_index_tensor,
            edge_attr=self._as_float_matrix(edge_features, self.edge_feature_dim),
        )

    def fit(self, G: nx.Graph, target_type: str, epochs=100):
        """Train the network on ``G`` with Adam and binary cross-entropy.

        Args:
            G: Graph to train on.
            target_type: ``'transaction'`` trains the edge head; any other
                value trains the node head.
            epochs: Number of full-graph optimisation steps.
        """
        data = self.prepare_pyg_data(G)
        # The graph does not change during training, so labels are built once.
        labels = self._training_labels(G, target_type)
        predict_edges = target_type == 'transaction'
        optimizer = torch.optim.Adam(self.model.parameters(), lr=0.01)

        self.model.train()
        for _ in range(epochs):
            optimizer.zero_grad()

            if predict_edges:
                out = self.model(data.x, data.edge_index, data.edge_attr, predict_edges=True)
            else:  # entity
                out = self.model(data.x, data.edge_index)

            # reshape(-1) keeps a single-row output 1-D so it matches labels.
            loss = F.binary_cross_entropy(out.reshape(-1), labels)
            loss.backward()
            optimizer.step()

    def predict(self, G: nx.Graph, target_type: str, targets_only: bool = False) -> np.ndarray:
        """Predict 0/1 labels by thresholding the network output at 0.5.

        Args:
            G: Graph to predict on.
            target_type: ``'transaction'`` uses the edge head; any other
                value uses the node head.
            targets_only: When False (default) return one value per directed
                edge (two per graph edge) or one per node, covering every
                edge/node.  When True return only Transaction edges (forward
                direction, in ``G.edges()`` order) or only Entity/Standalone
                nodes (in ``G.nodes()`` order).

        Returns:
            1-D integer array of predictions.
        """
        data = self.prepare_pyg_data(G)
        self.model.eval()

        with torch.no_grad():
            if target_type == 'transaction':
                out = self.model(data.x, data.edge_index, data.edge_attr, predict_edges=True)
            else:  # entity
                out = self.model(data.x, data.edge_index)

        predictions = (out.reshape(-1).numpy() > 0.5).astype(int)
        if not targets_only:
            return predictions

        if target_type == 'transaction':
            keep = [attrs.get('type') == 'Transaction' for _, _, attrs in G.edges(data=True)]
            # Even positions hold the forward direction of each graph edge.
            return predictions[0::2][np.array(keep, dtype=bool)]

        keep = [
            attrs.get('type') in ('Entity', 'Standalone')
            for _, attrs in G.nodes(data=True)
        ]
        return predictions[np.array(keep, dtype=bool)]

# Example usage:
if __name__ == "__main__":
    # Seed the global RNGs so the stochastic models below (tree ensembles,
    # network weight initialisation and dropout) give the same result on
    # every run of this cell.
    np.random.seed(42)
    torch.manual_seed(42)

    # Create sample graph
    G = nx.Graph()

    # Add nodes
    G.add_node('E1', type='Entity', country='US')
    G.add_node('A1', type='Account', parent_Entity_Id='E1')
    G.add_node('S1', type='Standalone', country='UK')
    G.add_node('C1', type='Cash Node')

    # Add edges with both suspicious and non-suspicious transactions
    G.add_edge('E1', 'A1', type='Account Ownership')
    G.add_edge('E1', 'S1', type='Transaction', amount=1000, txn_time=1430,
               entity_age=365, transaction_type=1, suspicious=1)
    G.add_edge('A1', 'C1', type='Transaction', amount=500, txn_time=1530,
               entity_age=365, transaction_type=0, suspicious=0)

    # Initialize models
    rf_model = GraphRandomForest()
    gb_model = GraphGradientBoosting()
    gnn_model = GraphGNNWrapper(node_features=5, edge_features=6)

    # Train models
    target_type = 'transaction'  # or 'entity'
    rf_model.fit(G, target_type)
    gb_model.fit(G, target_type)
    gnn_model.fit(G, target_type)

    # Make predictions (on the training graph itself: this is a smoke test,
    # not an evaluation)
    rf_pred = rf_model.predict(G, target_type)
    gb_pred = gb_model.predict(G, target_type)
    gnn_pred = gnn_model.predict(G, target_type)

In [7]:
rf_pred

array([1, 0])

In [8]:
gb_pred

array([1, 0])

In [9]:
gnn_pred

array([0, 0, 0, 0, 0, 0])