In [1]:
# Install required packages
!pip install torch torch-geometric pandas numpy scikit-learn matplotlib seaborn

Collecting torch-geometric
  Downloading torch_geometric-2.7.0-py3-none-any.whl.metadata (63 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.7/63.7 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
Downloading torch_geometric-2.7.0-py3-none-any.whl (1.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m21.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch-geometric
Successfully installed torch-geometric-2.7.0


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
import torch
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv, SAGEConv, GATConv
from torch_geometric.utils import from_networkx
import networkx as nx
import warnings
warnings.filterwarnings('ignore')

In [3]:
# ============================================================================
# STEP 1: Load and Explore Data
# ============================================================================

df = pd.read_csv('transactions.csv')


# ============================================================================
# STEP 2: Data Preprocessing and Graph Construction
# ============================================================================

def preprocess_data(df):

    print("DATA PREPROCESSING")

    # Display basic info
    print(f"\nDataset shape: {df.shape}")
    print(f"Columns: {df.columns.tolist()}")
    print(f"\nFirst few rows:")
    print(df.head())

    # Check for fraud distribution
    if 'is_fraud' in df.columns:
        fraud_dist = df['is_fraud'].value_counts()
        print(f"\nFraud Distribution:")
        print(fraud_dist)
        print(f"Fraud Rate: {fraud_dist[1]/len(df)*100:.2f}%")

    # Handle missing values
    df = df.dropna(subset=['from_address', 'to_address'])

    return df

def build_graph(df):
    """
    Build transaction graph from dataframe
    Nodes: Addresses
    Edges: Transactions
    """
    print("\n" + "="*70)
    print("GRAPH CONSTRUCTION")
    print("="*70)

    # Create unique node mapping
    unique_addresses = pd.concat([df['from_address'], df['to_address']]).unique()
    node_to_idx = {addr: idx for idx, addr in enumerate(unique_addresses)}
    idx_to_node = {idx: addr for addr, idx in node_to_idx.items()}

    print(f"\nNumber of unique addresses (nodes): {len(unique_addresses)}")
    print(f"Number of transactions (edges): {len(df)}")

    # Create edge index
    edge_index = []
    edge_attr = []

    for _, row in df.iterrows():
        from_idx = node_to_idx[row['from_address']]
        to_idx = node_to_idx[row['to_address']]

        edge_index.append([from_idx, to_idx])

        # Edge features: amount, gas_used, etc.
        edge_features = []
        if 'amount' in df.columns:
            edge_features.append(float(row['amount']))
        if 'gas_used' in df.columns:
            edge_features.append(float(row['gas_used']))
        if 'gas_price' in df.columns:
            edge_features.append(float(row['gas_price']))

        edge_attr.append(edge_features)

    edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()
    edge_attr = torch.tensor(edge_attr, dtype=torch.float)

    # Create node features
    node_features = create_node_features(df, node_to_idx)

    # Create node labels (fraud or not)
    node_labels = create_node_labels(df, node_to_idx)

    return edge_index, edge_attr, node_features, node_labels, node_to_idx, idx_to_node

def create_node_features(df, node_to_idx):
    """
    Create node features based on transaction patterns
    """
    print("\nCreating node features...")

    num_nodes = len(node_to_idx)
    features = []

    for addr, idx in node_to_idx.items():
        # Outgoing transactions
        out_txs = df[df['from_address'] == addr]
        in_txs = df[df['to_address'] == addr]

        feature_vector = [
            len(out_txs),  # Number of outgoing transactions
            len(in_txs),   # Number of incoming transactions
            out_txs['amount'].sum() if 'amount' in df.columns and len(out_txs) > 0 else 0,
            in_txs['amount'].sum() if 'amount' in df.columns and len(in_txs) > 0 else 0,
            out_txs['amount'].mean() if 'amount' in df.columns and len(out_txs) > 0 else 0,
            in_txs['amount'].mean() if 'amount' in df.columns and len(in_txs) > 0 else 0,
            out_txs['gas_used'].mean() if 'gas_used' in df.columns and len(out_txs) > 0 else 0,
            in_txs['gas_used'].mean() if 'gas_used' in df.columns and len(in_txs) > 0 else 0,
        ]
        features.append(feature_vector)

    features = np.array(features)

    # Normalize features
    scaler = StandardScaler()
    features = scaler.fit_transform(features)

    print(f"Node feature shape: {features.shape}")

    return torch.tensor(features, dtype=torch.float)

def create_node_labels(df, node_to_idx):
    """
    Create node labels (1 if involved in fraud, 0 otherwise)
    """
    print("Creating node labels...")

    labels = np.zeros(len(node_to_idx))

    if 'is_fraud' in df.columns:
        fraud_txs = df[df['is_fraud'] == 1]

        for _, row in fraud_txs.iterrows():
            from_idx = node_to_idx[row['from_address']]
            to_idx = node_to_idx[row['to_address']]
            labels[from_idx] = 1
            labels[to_idx] = 1

        print(f"Fraudulent nodes: {labels.sum()}/{len(labels)} ({labels.sum()/len(labels)*100:.2f}%)")

    return torch.tensor(labels, dtype=torch.long)

In [4]:
# ============================================================================
# STEP 3: GNN Model Definition
# ============================================================================

class GCN_FraudDetector(torch.nn.Module):
    """
    Graph Convolutional Network for Fraud Detection
    """
    def __init__(self, num_features, hidden_channels):
        super(GCN_FraudDetector, self).__init__()
        self.conv1 = GCNConv(num_features, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.conv3 = GCNConv(hidden_channels, hidden_channels // 2)
        self.linear = torch.nn.Linear(hidden_channels // 2, 2)
        self.dropout = torch.nn.Dropout(0.5)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.dropout(x)

        x = self.conv2(x, edge_index)
        x = F.relu(x)
        x = self.dropout(x)

        x = self.conv3(x, edge_index)
        x = F.relu(x)

        x = self.linear(x)
        return F.log_softmax(x, dim=1)

class GraphSAGE_FraudDetector(torch.nn.Module):
    """
    GraphSAGE for Fraud Detection
    """
    def __init__(self, num_features, hidden_channels):
        super(GraphSAGE_FraudDetector, self).__init__()
        self.conv1 = SAGEConv(num_features, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, hidden_channels)
        self.conv3 = SAGEConv(hidden_channels, hidden_channels // 2)
        self.linear = torch.nn.Linear(hidden_channels // 2, 2)
        self.dropout = torch.nn.Dropout(0.5)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.dropout(x)

        x = self.conv2(x, edge_index)
        x = F.relu(x)
        x = self.dropout(x)

        x = self.conv3(x, edge_index)
        x = F.relu(x)

        x = self.linear(x)
        return F.log_softmax(x, dim=1)

class GAT_FraudDetector(torch.nn.Module):
    """
    Graph Attention Network for Fraud Detection
    """
    def __init__(self, num_features, hidden_channels):
        super(GAT_FraudDetector, self).__init__()
        self.conv1 = GATConv(num_features, hidden_channels, heads=8, dropout=0.6)
        self.conv2 = GATConv(hidden_channels * 8, hidden_channels, heads=1, concat=False, dropout=0.6)
        self.linear = torch.nn.Linear(hidden_channels, 2)

    def forward(self, x, edge_index):
        x = F.dropout(x, p=0.6, training=self.training)
        x = self.conv1(x, edge_index)
        x = F.elu(x)
        x = F.dropout(x, p=0.6, training=self.training)
        x = self.conv2(x, edge_index)
        x = self.linear(x)
        return F.log_softmax(x, dim=1)

In [None]:
# ============================================================================
# STEP 4: Training and Evaluation
# ============================================================================

def train_model(model, data, train_mask, optimizer, criterion):
    """
    Train the model for one epoch
    """
    model.train()
    optimizer.zero_grad()
    out = model(data.x, data.edge_index)
    loss = criterion(out[train_mask], data.y[train_mask])
    loss.backward()
    optimizer.step()
    return loss.item()

def evaluate_model(model, data, mask):
    """
    Evaluate the model
    """
    model.eval()
    with torch.no_grad():
        out = model(data.x, data.edge_index)
        pred = out.argmax(dim=1)
        correct = (pred[mask] == data.y[mask]).sum()
        acc = int(correct) / int(mask.sum())
    return acc, pred

def train_and_evaluate(model_class, model_name, data, epochs=200, lr=0.01, hidden_channels=64):
    """
    Complete training and evaluation pipeline
    """
    print("\n" + "="*70)
    print(f"TRAINING {model_name}")
    print("="*70)

    # Initialize model
    model = model_class(data.num_features, hidden_channels)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=5e-4)
    criterion = torch.nn.NLLLoss()

    # Training loop
    train_losses = []
    val_accs = []

    for epoch in range(epochs):
        loss = train_model(model, data, data.train_mask, optimizer, criterion)
        train_losses.append(loss)

        if epoch % 20 == 0:
            train_acc, _ = evaluate_model(model, data, data.train_mask)
            val_acc, _ = evaluate_model(model, data, data.val_mask)
            val_accs.append(val_acc)
            print(f'Epoch {epoch:03d}, Loss: {loss:.4f}, Train Acc: {train_acc:.4f}, Val Acc: {val_acc:.4f}')

    # Final evaluation
    test_acc, test_pred = evaluate_model(model, data, data.test_mask)
    print(f'\n{model_name} Test Accuracy: {test_acc:.4f}')

    # Detailed metrics
    y_true = data.y[data.test_mask].cpu().numpy()
    y_pred = test_pred[data.test_mask].cpu().numpy()

    print("\nClassification Report:")
    print(classification_report(y_true, y_pred, target_names=['Normal', 'Fraud']))

    return model, train_losses, val_accs, y_true, y_pred

In [5]:
# ============================================================================
# STEP 5: Visualization
# ============================================================================

def plot_results(train_losses, model_name):
    """
    Plot training curves
    """
    plt.figure(figsize=(10, 5))
    plt.plot(train_losses)
    plt.title(f'{model_name} - Training Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.grid(True)
    plt.show()

def plot_confusion_matrix(y_true, y_pred, model_name):
    """
    Plot confusion matrix
    """
    cm = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title(f'{model_name} - Confusion Matrix')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.show()

def compare_models(results):
    """
    Compare multiple models
    """
    models = list(results.keys())
    accuracies = [results[m]['test_acc'] for m in models]

    plt.figure(figsize=(10, 6))
    plt.bar(models, accuracies, color=['#1f77b4', '#ff7f0e', '#2ca02c'])
    plt.title('Model Comparison - Test Accuracy')
    plt.ylabel('Accuracy')
    plt.ylim([0, 1])
    for i, v in enumerate(accuracies):
        plt.text(i, v + 0.02, f'{v:.4f}', ha='center', fontweight='bold')
    plt.show()

In [9]:
# ============================================================================
# MAIN EXECUTION
# ============================================================================
print("GNN-BASED FRAUD DETECTION SYSTEM")

# STEP 0: First, explore your data to understand its structure

# Load your data
df = pd.read_csv('transactions.csv')

# Preprocess
df = preprocess_data(df)

# Build graph
edge_index, edge_attr, node_features, node_labels, node_to_idx, idx_to_node = build_graph(df)

# Create PyTorch Geometric Data object
data = Data(x=node_features, edge_index=edge_index, y=node_labels)

# Create train/val/test masks
num_nodes = len(node_labels)
indices = np.arange(num_nodes)
train_idx, temp_idx = train_test_split(indices, train_size=0.7, stratify=node_labels.numpy())
val_idx, test_idx = train_test_split(temp_idx, train_size=0.5, stratify=node_labels[temp_idx].numpy())

data.train_mask = torch.zeros(num_nodes, dtype=torch.bool)
data.val_mask = torch.zeros(num_nodes, dtype=torch.bool)
data.test_mask = torch.zeros(num_nodes, dtype=torch.bool)

data.train_mask[train_idx] = True
data.val_mask[val_idx] = True
data.test_mask[test_idx] = True

print(f"\nData splits - Train: {data.train_mask.sum()}, Val: {data.val_mask.sum()}, Test: {data.test_mask.sum()}")

# Train multiple models
results = {}

# GCN
model_gcn, losses_gcn, _, y_true_gcn, y_pred_gcn = train_and_evaluate(
    GCN_FraudDetector, "GCN", data, epochs=200
)
results['GCN'] = {'test_acc': (y_true_gcn == y_pred_gcn).mean()}
plot_results(losses_gcn, "GCN")
plot_confusion_matrix(y_true_gcn, y_pred_gcn, "GCN")

# GraphSAGE
model_sage, losses_sage, _, y_true_sage, y_pred_sage = train_and_evaluate(
    GraphSAGE_FraudDetector, "GraphSAGE", data, epochs=200
)
results['GraphSAGE'] = {'test_acc': (y_true_sage == y_pred_sage).mean()}
plot_results(losses_sage, "GraphSAGE")
plot_confusion_matrix(y_true_sage, y_pred_sage, "GraphSAGE")

# GAT
model_gat, losses_gat, _, y_true_gat, y_pred_gat = train_and_evaluate(
    GAT_FraudDetector, "GAT", data, epochs=200
)
results['GAT'] = {'test_acc': (y_true_gat == y_pred_gat).mean()}
plot_results(losses_gat, "GAT")
plot_confusion_matrix(y_true_gat, y_pred_gat, "GAT")

# Compare all models
compare_models(results)

# Save best model
best_model_name = max(results, key=lambda x: results[x]['test_acc'])
print(f"\nBest Model: {best_model_name}")

GNN-BASED FRAUD DETECTION SYSTEM
DATA PREPROCESSING

Dataset shape: (485576, 9)
Columns: ['tx_hash', 'from_wallet', 'to_wallet', 'token', 'amount', 'timestamp', 'gas_fee_usd', 'platform', 'tx_type']

First few rows:
                                             tx_hash  \
0  0xc40e0b25086f94703e134ffc6cab543321f7610c0302...   
1  0x2e32672b7ccb2c55db06c91ca17122dea7c1c6f7f848...   
2  0x3d2cde2b23d28e641a6d5f2e013f0507213c00ca34ef...   
3  0x21c59e7a4f0fc4b1d7b5675e6c8416d0ac0f6b85ed8b...   
4  0x37a1cc6eab2bf725091ea66a44625e34c76eb005f5dc...   

                                  from_wallet  \
0  0xd99d937bfcbe7f83bc761bf0fe3316fdfde27202   
1  0x20ad73be3c5ec493f2033ff9bc7d60fb5d4cdb95   
2  0x52e3343a7b26c07e6dd7f3c16b3a79cf496123b7   
3  0x93bc0e6b2790e4ddd7c3c8779e218ed2b3ecee40   
4  0x4ebcc8d240d2e3be0bc39c5485ea707fec7007d0   

                                    to_wallet token     amount  \
0  0x3d3d6ad4dc669c598c31454a3f73410da25922fe   DAI   4.466987   
1  0x3d6f42ea21ae34d

KeyError: ['from_address', 'to_address']