# Dataset Splitting

In [1]:
import os
import torch
import numpy as np
from sklearn.model_selection import train_test_split

# Resolve the data folders relative to the current working directory
work_dir = os.getcwd()
input_data_dir = os.path.join(work_dir, '../Data')  # Folder the merged dataset is read from
output_dir = os.path.join(work_dir, '../Data')  # Folder results are written to (same folder as the input)

# Load the merged graph dataset with labels
# NOTE: torch.load unpickles the file, so only load dataset files from a trusted source.
merged_file = os.path.join(input_data_dir, 'all_graphs_with_labels-train.pt')
merged_graphs = torch.load(merged_file)

# Gather the label of every graph (tensor labels are converted to NumPy, anything
# else is kept as is) and stack them into a single NumPy array
label_rows = []
for graph in merged_graphs:
    label_rows.append(graph.y.numpy() if isinstance(graph.y, torch.Tensor) else graph.y)
labels = np.array(label_rows)

# Random (shuffled) split of graphs and labels into two parts
def random_train_test_split(graphs, labels, test_size=0.3, random_state=42):
    """Randomly partition graphs and their labels into a training and a test part.

    Args:
        graphs: Sequence of graphs to split.
        labels: Labels aligned with ``graphs``.
        test_size: Forwarded to ``train_test_split`` as the size of the test part.
        random_state: Forwarded to ``train_test_split`` as the shuffling seed.

    Returns:
        Tuple ``(train_graphs, test_graphs, train_labels, test_labels)``.
    """
    split_parts = train_test_split(
        graphs,
        labels,
        test_size=test_size,
        random_state=random_state,
        shuffle=True,
    )
    graphs_train, graphs_test, labels_train, labels_test = split_parts
    return graphs_train, graphs_test, labels_train, labels_test

# First pass: keep 70% for training and hold out the remaining 30%
train_graphs, temp_graphs, train_labels, temp_labels = random_train_test_split(
    graphs=merged_graphs, labels=labels, test_size=0.3, random_state=42)

# Second pass: 0.33 of the 30% hold-out becomes the test set (~10% of the data),
# the rest of the hold-out becomes the validation set (~20% of the data)
val_graphs, test_graphs, val_labels, test_labels = random_train_test_split(
    graphs=temp_graphs, labels=temp_labels, test_size=0.33, random_state=42)

# Function to calculate the proportion of '1's in each label across the labels dataset
def calculate_label_proportions(labels):
    """Return the fraction of entries equal to 1 in each label column.

    Args:
        labels: Array-like of shape (num_samples, num_labels). Nested lists and
            tuples are accepted as well as NumPy arrays; the input is coerced
            with ``np.asarray`` so the comparison below is always element-wise
            (comparing a plain list with ``1`` would yield a single ``False``).

    Returns:
        NumPy array with one proportion per label column (mean over axis 0).
    """
    label_array = np.asarray(labels)
    return np.mean(label_array == 1, axis=0)

# Share of '1's per label in every subset (computed while the labels are still NumPy arrays)
train_proportions = calculate_label_proportions(train_labels)
val_proportions = calculate_label_proportions(val_labels)
test_proportions = calculate_label_proportions(test_labels)

# Turn the label arrays into torch tensors so they can be used by the PyTorch code below
train_labels, val_labels, test_labels = (
    torch.tensor(subset_labels)
    for subset_labels in (train_labels, val_labels, test_labels)
)

# Report how many graphs ended up in each subset
print(f"Training set: {len(train_graphs)} graphs")
print(f"Validation set: {len(val_graphs)} graphs")
print(f"Test set: {len(test_graphs)} graphs")

# Report the per-label share of '1's for each subset
print("Proportion of '1's for each label in training set:", train_proportions)
print("Proportion of '1's for each label in validation set:", val_proportions)
print("Proportion of '1's for each label in test set:", test_proportions)


Training set: 4256 graphs
Validation set: 1222 graphs
Test set: 602 graphs
Proportion of '1's for each label in training set: [0.06343985 0.06414474 0.57307331 0.10103383 0.30451128]
Proportion of '1's for each label in validation set: [0.06873977 0.06792144 0.57774141 0.09165303 0.30032733]
Proportion of '1's for each label in test set: [0.04318937 0.06644518 0.57807309 0.09966777 0.3255814 ]


# Graph Transformer Network Model Architecture

In [2]:
import torch
import torch.nn as nn
from torch_geometric.nn import global_add_pool
from torch.nn import TransformerEncoder, TransformerEncoderLayer
import torch.optim as optim
from sklearn.metrics import f1_score, recall_score, precision_score
from tqdm import tqdm
from torch_geometric.loader import DataLoader
from torch.optim.lr_scheduler import ReduceLROnPlateau

# Seed the random, numpy and torch generators so runs are reproducible
def set_seed(seed: int) -> None:
    """Apply ``seed`` to Python's ``random``, NumPy and PyTorch (CPU and CUDA).

    Args:
        seed: Integer seed handed to every generator.
    """
    import random
    import numpy as np

    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,  # Relevant when running on a GPU
    )
    for apply_seed in seeders:
        apply_seed(seed)


set_seed(42)  # Fix the seed once for the whole run

# Custom Transformer Encoder Layer to return attention weights
class CustomTransformerEncoderLayer(TransformerEncoderLayer):
    """Encoder layer (attention -> add & norm -> feed-forward -> add & norm) that
    also returns the attention weights produced by its self-attention module.

    Because ``forward`` returns a tuple, the layer is meant to be called
    directly rather than through ``TransformerEncoder.forward``.
    """

    def forward(self, src: torch.Tensor, src_mask: "torch.Tensor | None" = None,
                src_key_padding_mask: "torch.Tensor | None" = None) -> tuple:
        """Run the layer on ``src``.

        Args:
            src: Input features; passed as query, key and value.
            src_mask: Optional attention mask forwarded as ``attn_mask``. For a
                boolean mask, ``True`` marks a pair that must NOT attend.
            src_key_padding_mask: Optional mask forwarded as ``key_padding_mask``.

        Returns:
            Tuple ``(output, attn_weights)``.
        """
        # Calculate self-attention and keep the attention weights
        attn_output, attn_weights = self.self_attn(
            src, src, src,
            attn_mask=src_mask,
            key_padding_mask=src_key_padding_mask,
        )
        src = self.norm1(src + self.dropout1(attn_output))

        # Apply feed-forward layer
        ff_output = self.linear2(self.dropout(self.activation(self.linear1(src))))
        src = self.norm2(src + self.dropout2(ff_output))

        return src, attn_weights

# Transformer-based model definition
class TransformerModel(nn.Module):
    """Graph-level predictor: linear embedding -> self-attention layers over the
    nodes -> sum pooling per graph -> linear output layer.

    Args:
        in_dim: Size of each node feature vector.
        hidden_dim: Width of the embedding and of the Transformer layers.
        out_dim: Number of outputs (logits) per graph.
        num_heads: Number of attention heads.
        num_layers: Number of encoder layers.
        dropout: Dropout probability used in the encoder and before the output layer.
        dosage_weight: Factor applied to the dosage feature column.
        dosage_index: Column of ``g.x`` that holds the dosage feature (default 90).
    """

    def __init__(self, in_dim: int, hidden_dim: int, out_dim: int, num_heads: int, num_layers: int, dropout: float = 0.3, dosage_weight: float = 1.0, dosage_index: int = 90):
        super().__init__()
        self.dosage_weight = dosage_weight  # Weight for dosage feature
        self.dosage_index = dosage_index  # Column of the dosage feature in g.x
        self.num_layers = num_layers  # Store number of layers

        # Embedding layer
        self.embedding = nn.Linear(in_dim, hidden_dim)
        self.norm = nn.LayerNorm(hidden_dim)

        # Define the Transformer encoder layers
        encoder_layers = CustomTransformerEncoderLayer(d_model=hidden_dim, nhead=num_heads, dim_feedforward=hidden_dim, dropout=dropout, batch_first=True)
        self.transformer_encoder = TransformerEncoder(encoder_layers, num_layers=num_layers)

        # Global pooling
        self.global_pool = global_add_pool

        # Fully connected output layer
        self.fc = nn.Linear(hidden_dim, out_dim)

        # Dropout layer for regularization
        self.dropout = nn.Dropout(p=dropout)

        # Initialize weights
        self._initialize_weights()

    def _initialize_weights(self) -> None:
        """Xavier-initialise the embedding and output weights and zero their biases."""
        for linear in (self.embedding, self.fc):
            nn.init.xavier_uniform_(linear.weight)
            if linear.bias is not None:
                nn.init.zeros_(linear.bias)

    @staticmethod
    def _cross_graph_mask(batch: torch.Tensor) -> torch.Tensor:
        """Return a boolean (N, N) mask that is True where two nodes belong to different graphs."""
        return batch.unsqueeze(0) != batch.unsqueeze(1)

    def forward(self, g) -> tuple:
        """Compute graph-level outputs for a graph or a mini-batch of graphs.

        Args:
            g: Object exposing ``x`` (node features), ``edge_index`` and,
                for mini-batches, ``batch`` (graph id of every node).

        Returns:
            Tuple ``(out, attn_weights_list, edge_index)`` where
            ``attn_weights_list`` holds one attention-weight tensor per layer.
        """
        x = g.x.clone()  # Clone input to avoid modifying the original data
        x[:, self.dosage_index] *= self.dosage_weight  # Apply dosage weight to the dosage feature

        h = self.norm(self.embedding(x))

        # All nodes of a mini-batch are fed to attention as ONE sequence, so without a
        # mask every node would attend to nodes of other graphs and the result for a
        # graph would depend on what else is in the batch. Restrict attention to
        # nodes of the same graph; a single graph (no batch vector) needs no mask.
        batch = getattr(g, 'batch', None)
        attn_mask = None if batch is None else self._cross_graph_mask(batch)

        # Transformer encoder, collecting the attention weights of every layer
        attn_weights_list = []
        for layer in self.transformer_encoder.layers:
            h, attn_weights = layer(h, src_mask=attn_mask)
            attn_weights_list.append(attn_weights)

        # Apply global pooling over node features to get graph-level features
        hg = self.global_pool(h, batch)

        hg = self.dropout(hg)
        out = self.fc(hg)

        # Return the output, attention weights, and edge indices
        edge_indices = g.edge_index  # Get edge indices from the graph data
        return out, attn_weights_list, edge_indices

# Hyperparameters of the Transformer model
in_dim = 91          # Size of each input feature vector
hidden_dim = 128     # Width of the embedding / Transformer layers
out_dim = 5          # Number of model outputs
num_heads = 16       # Attention heads per Transformer layer
num_layers = 1       # Number of Transformer encoder layers
dropout = 0.3        # Dropout probability
dosage_weight = 1    # Multiplier applied to the dosage feature

# Build the model from the settings above and show its structure
model = TransformerModel(
    in_dim=in_dim,
    hidden_dim=hidden_dim,
    out_dim=out_dim,
    num_heads=num_heads,
    num_layers=num_layers,
    dropout=dropout,
    dosage_weight=dosage_weight,
)
print(model)


TransformerModel(
  (embedding): Linear(in_features=91, out_features=128, bias=True)
  (norm): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0): CustomTransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
        )
        (linear1): Linear(in_features=128, out_features=128, bias=True)
        (dropout): Dropout(p=0.3, inplace=False)
        (linear2): Linear(in_features=128, out_features=128, bias=True)
        (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.3, inplace=False)
        (dropout2): Dropout(p=0.3, inplace=False)
      )
    )
  )
  (fc): Linear(in_features=128, out_features=5, bias=True)
  (dropout): Dropout(p=0.3, inplace=False)
)


# Model Training

In [None]:
import torch
import numpy as np
from torch import nn, optim
from torch.utils.data import DataLoader
from tqdm import tqdm
from sklearn.metrics import recall_score, f1_score, precision_score
from torch.optim.lr_scheduler import ReduceLROnPlateau

# Per-label positive weights (negatives / positives) to counter label imbalance
num_classes = train_labels.size(1)
num_samples = train_labels.size(0)
pos_counts = train_labels.sum(dim=0)
neg_counts = num_samples - pos_counts
pos_weight = neg_counts / (pos_counts + 1e-6)  # The small constant avoids division by zero

# Weighted loss, Adam optimizer, and a scheduler that halves the learning rate
# once the monitored value has stopped decreasing for more than `patience` epochs
loss_fn = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
learning_rate = 0.0007
optimizer = optim.Adam(
    model.parameters(),
    lr=learning_rate,
    weight_decay=1e-4,
)
scheduler = ReduceLROnPlateau(
    optimizer,
    mode='min',
    factor=0.5,
    patience=2,
)

# Early stopping mechanism to halt training if validation loss doesn't improve
class EarlyStopping:
    """Track the validation loss and raise a flag once it stops improving.

    A call counts as an improvement when the loss is at most
    ``best_loss - min_delta``. After ``patience`` consecutive calls without
    improvement ``early_stop`` becomes ``True``.

    Args:
        patience: Number of consecutive non-improving calls tolerated.
        min_delta: Minimum decrease of the loss that counts as an improvement.
    """

    def __init__(self, patience: int = 5, min_delta: float = 0.001):
        self.patience = patience
        self.min_delta = min_delta
        self.best_loss = None  # Best loss seen so far (None until the first call)
        self.counter = 0  # Consecutive calls without improvement
        self.early_stop = False

    def __call__(self, val_loss: float) -> None:
        """Record ``val_loss`` and update ``counter`` / ``early_stop``."""
        if self.best_loss is None:
            self.best_loss = val_loss
            return

        # NaN compares False against everything, so it would otherwise be taken for an
        # improvement, overwrite best_loss and reset the counter. Treat it as a
        # non-improving epoch instead.
        is_nan = val_loss != val_loss
        improved = not is_nan and not (val_loss > self.best_loss - self.min_delta)

        if improved:
            self.best_loss = val_loss
            self.counter = 0
            return

        self.counter += 1
        if self.counter >= self.patience:
            self.early_stop = True

# Report the learning rate of every parameter group of the optimizer
def print_learning_rate(optimizer) -> None:
    """Print one ``Current Learning Rate`` line per entry in ``optimizer.param_groups``."""
    for group in optimizer.param_groups:
        current_lr = group['lr']
        print(f"Current Learning Rate: {current_lr}")

# Data loading function to create batches of graphs and labels
def create_batches(graphs, labels, batch_size: int) -> DataLoader:
    """Attach a label to every graph and wrap the graphs in a shuffling loader.

    Note that the graphs are modified in place: ``graph.y`` is overwritten
    with ``labels[i]``.

    Args:
        graphs: Sequence of graph objects.
        labels: Indexable of labels aligned with ``graphs``.
        batch_size: Number of graphs per mini-batch.

    Returns:
        A PyTorch Geometric ``DataLoader`` that yields shuffled mini-batches.
    """
    # Imported locally under its own name: the module-level name `DataLoader` is
    # rebound by `from torch.utils.data import DataLoader`, and that loader's
    # default collate function cannot merge graph objects into a batch.
    from torch_geometric.loader import DataLoader as GraphDataLoader

    for i, graph in enumerate(graphs):
        graph.y = labels[i]  # Attach labels to graph data
    return GraphDataLoader(graphs, batch_size=batch_size, shuffle=True)

# Function to train the model
def train_model(train_graphs, train_labels, val_graphs, val_labels, model, loss_fn, optimizer, scheduler, num_epochs: int = 50, batch_size: int = 32, early_stopping_patience: int = 5) -> None:
    """Train ``model`` and validate it after every epoch.

    Each epoch runs one pass over the training loader, then computes the
    validation loss and micro-averaged recall / F1 / precision, steps the
    scheduler with the validation loss and checks early stopping.

    Args:
        train_graphs: Training graphs.
        train_labels: Labels aligned with ``train_graphs``.
        val_graphs: Validation graphs.
        val_labels: Labels aligned with ``val_graphs``.
        model: Model returning ``(output, attn_weights, edge_indices)``.
        loss_fn: Loss called as ``loss_fn(output, labels)``.
        optimizer: Optimizer updating the model parameters.
        scheduler: Scheduler stepped with the average validation loss.
        num_epochs: Maximum number of epochs.
        batch_size: Graphs per mini-batch.
        early_stopping_patience: Non-improving epochs tolerated before stopping.
    """
    train_loader = create_batches(train_graphs, train_labels, batch_size)
    val_loader = create_batches(val_graphs, val_labels, batch_size)

    early_stopping = EarlyStopping(patience=early_stopping_patience)

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0.0
        for batch in tqdm(train_loader, desc=f"Epoch {epoch + 1}/{num_epochs}"):
            # Forward pass (attention weights and edge indices are not needed here)
            output, _, _ = model(batch)

            # Batching flattens the per-graph labels; restore (graphs, labels) layout and
            # match the logits' dtype, since the loss requires floating-point targets
            batch_labels = batch.y.view(output.shape).to(output.dtype)

            # Compute loss
            loss = loss_fn(output, batch_labels)

            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch + 1}/{num_epochs}, Training Loss: {avg_loss:.4f}")

        # Validate the model
        model.eval()
        val_loss = 0.0
        val_predictions = []
        val_targets = []
        with torch.no_grad():
            for batch in val_loader:
                output, _, _ = model(batch)
                batch_labels = batch.y.view(output.shape).to(output.dtype)
                val_loss += loss_fn(output, batch_labels).item()

                # Threshold the probabilities at 0.5 to obtain binary predictions
                val_predictions.extend(torch.sigmoid(output).round().cpu().numpy())
                val_targets.extend(batch_labels.cpu().numpy())

        avg_val_loss = val_loss / len(val_loader)

        # Calculate recall, F1 score, and precision for validation set
        # (zero_division=0 reports 0 without a warning when nothing is predicted positive)
        recall = recall_score(val_targets, val_predictions, average='micro', zero_division=0)
        f1 = f1_score(val_targets, val_predictions, average='micro', zero_division=0)
        precision = precision_score(val_targets, val_predictions, average='micro', zero_division=0)

        print(f"Epoch {epoch + 1}/{num_epochs}, Validation Loss: {avg_val_loss:.4f}, "
              f"Validation Recall: {recall:.4f}, Validation F1: {f1:.4f}, Validation Precision: {precision:.4f}")

        # Step the learning rate scheduler
        scheduler.step(avg_val_loss)
        print_learning_rate(optimizer)

        # Check early stopping
        early_stopping(avg_val_loss)
        if early_stopping.early_stop:
            print("Early stopping triggered")
            break

# Run training: at most 50 epochs with mini-batches of 128 graphs
train_model(
    train_graphs=train_graphs,
    train_labels=train_labels,
    val_graphs=val_graphs,
    val_labels=val_labels,
    model=model,
    loss_fn=loss_fn,
    optimizer=optimizer,
    scheduler=scheduler,
    num_epochs=50,
    batch_size=128,
)


# Model Evaluation

In [None]:
import torch
import numpy as np
from sklearn.metrics import roc_curve, auc, precision_recall_fscore_support, accuracy_score, confusion_matrix
import pandas as pd
import os
from matplotlib import rcParams

# Make Arial the default matplotlib font family for any figure created after this point.
# NOTE(review): no figure is drawn in this section; presumably this is for plots made
# elsewhere, and it assumes Arial is installed on the machine - confirm.
rcParams['font.family'] = 'Arial'

# Evaluate the model on one dataset, graph by graph, and optionally export attention weights
def evaluate_model(graphs, labels, model, output_dir, data_name, cpm_id=None):
    """Run ``model`` on every graph, save metrics, and optionally save attention weights.

    Args:
        graphs: Graphs to evaluate; each one is passed to the model on its own.
        labels: Indexable of label tensors aligned with ``graphs``.
        model: Model returning ``(output, attn_weights, edge_indices)``.
        output_dir: Directory the result files are written to.
        data_name: Dataset name used as prefix of the metric file names.
        cpm_id: If given, attention weights of graphs with this ``cpm_id`` are saved too.
    """
    model.eval()  # Evaluation mode (disables dropout)

    logits_per_graph = []
    labels_per_graph = []
    attn_per_graph = []
    edges_per_graph = []  # Edge indices of every graph, kept alongside the attention weights

    # Gather logits, labels, attention weights and edge indices without tracking gradients
    with torch.no_grad():
        for idx, graph in enumerate(graphs):
            logits, attn, edges = model(graph)
            logits_per_graph.append(logits.cpu().numpy())
            labels_per_graph.append(labels[idx].cpu().numpy())
            attn_per_graph.append(attn)
            edges_per_graph.append(edges)

    # Stack the per-graph rows into (num_graphs, num_labels) arrays
    final_outputs = np.vstack(logits_per_graph)
    final_labels = np.vstack(labels_per_graph)

    # Compute the metrics and write them to disk
    compute_and_save_metrics(final_labels, final_outputs, output_dir, data_name)

    # Attention weights are only exported when a `cpm_id` was requested
    if cpm_id is None:
        return
    output_attention_weights(attn_per_graph, edges_per_graph, graphs, cpm_id, output_dir)

# Function to output attention weights with edge indices
def output_attention_weights(all_attn_weights, all_edge_indices, graphs, cpm_id, output_dir):
    """Save the first layer's attention weights of the graph(s) matching ``cpm_id`` as CSV.

    The CSV has one row per node: the node name followed by the matching row of
    the transposed attention-weight matrix.

    Args:
        all_attn_weights: Per graph, the list of attention-weight tensors (one per layer).
        all_edge_indices: Per graph, the edge indices. Kept for interface
            compatibility; the values are not written to the file.
        graphs: Graphs aligned with the two lists above; matching graphs must
            provide ``node_names``.
        cpm_id: Identifier compared against ``graph.cpm_id``.
        output_dir: Directory the CSV file is written to.
    """
    file_name = f'{cpm_id}_attn_weights-transform.csv'
    found = False

    for i, graph in enumerate(graphs):
        # Only the graph with the specified `cpm_id` is exported
        if not (hasattr(graph, 'cpm_id') and graph.cpm_id == cpm_id):
            continue
        found = True

        # Attention weights of the first encoder layer
        first_layer_weights = all_attn_weights[i][0].cpu().numpy()

        # Transpose weights for ease of reading and put the node names in the first column
        merged_array = np.column_stack((graph.node_names, first_layer_weights.T))

        # Save as CSV
        np.savetxt(os.path.join(output_dir, file_name), merged_array, delimiter=',', fmt='%s')
        print(f"Attention weights saved as {file_name}")

    if not found:
        # Make a mistyped or missing id visible instead of silently writing nothing
        print(f"No graph with cpm_id {cpm_id!r} found; no attention weights were saved.")

# Function to compute and save various performance metrics and ROC data
def compute_and_save_metrics(labels, outputs, output_dir, data_name):
    """Compute per-class metrics from logits and write metrics and ROC data as CSV.

    For every class the logits are turned into probabilities with a sigmoid and
    thresholded at 0.5. Precision, recall, F1, AUC, accuracy and specificity are
    stored per class, followed by an ``Average`` row (unweighted mean over classes).

    Files written to ``output_dir``:
        ``{data_name}_roc_data_transform.csv``: long format (class, reference, probability).
        ``{data_name}_metrics_transform.csv``: one row per class plus the average row.

    Args:
        labels: Array of shape (num_samples, num_classes) with the reference labels.
        outputs: Array of the same shape with the model logits.
        output_dir: Target directory; created if it does not exist.
        data_name: Prefix of the output file names.
    """
    num_classes = labels.shape[1]
    metrics = {
        'Class': [],
        'Precision': [],
        'Recall': [],
        'F1 Score': [],
        'AUC': [],
        'Accuracy': [],
        'Specificity': []
    }
    roc_data_long_format = {'Class': [], 'Reference': [], 'Predicted': []}

    # Apply sigmoid to convert logits to probabilities (once, for all classes)
    probabilities = torch.sigmoid(torch.tensor(outputs)).numpy()

    for i in range(num_classes):
        class_name = f'Class_{i+1}'
        y_true = labels[:, i]
        y_prob = probabilities[:, i]

        # ROC curve and AUC calculation
        fpr, tpr, _ = roc_curve(y_true, y_prob)
        roc_auc = auc(fpr, tpr)

        # Store ROC data in long format for each class
        roc_data_long_format['Class'].extend([class_name] * len(y_true))
        roc_data_long_format['Reference'].extend(y_true)
        roc_data_long_format['Predicted'].extend(y_prob)

        # Calculate Precision, Recall, F1, Accuracy, and Specificity
        pred_binary = (y_prob > 0.5).astype(int)
        precision, recall, f1, _ = precision_recall_fscore_support(
            y_true, pred_binary, average='binary', zero_division=0)
        accuracy = accuracy_score(y_true, pred_binary)

        # labels=[0, 1] forces a 2x2 matrix even when only one class value occurs;
        # otherwise the matrix is 1x1 and the four-way unpacking fails
        tn, fp, fn, tp = confusion_matrix(y_true, pred_binary, labels=[0, 1]).ravel()
        specificity = tn / (tn + fp) if (tn + fp) > 0 else 0

        # Store metrics for each class
        metrics['Class'].append(class_name)
        metrics['Precision'].append(precision)
        metrics['Recall'].append(recall)
        metrics['F1 Score'].append(f1)
        metrics['AUC'].append(roc_auc)
        metrics['Accuracy'].append(accuracy)
        metrics['Specificity'].append(specificity)

    # Calculate average metrics across all classes (before the average row is appended)
    averages = {key: np.mean(values) for key, values in metrics.items() if key != 'Class'}

    # Append average metrics to the metrics dictionary
    metrics['Class'].append('Average')
    for key, value in averages.items():
        metrics[key].append(value)

    os.makedirs(output_dir, exist_ok=True)

    # Save ROC data in long format to CSV
    roc_df_long = pd.DataFrame(roc_data_long_format)
    roc_df_long.to_csv(os.path.join(output_dir, f'{data_name}_roc_data_transform.csv'), index=False)

    # Save metrics data to CSV
    metrics_df = pd.DataFrame(metrics)
    metrics_df.to_csv(os.path.join(output_dir, f'{data_name}_metrics_transform.csv'), index=False)
    print(f"Metrics and ROC data saved to {output_dir}.")

# Input and output both live in ../Data relative to the current working directory
work_dir = os.getcwd()
input_data_dir = output_dir = os.path.join(work_dir, '../Data')

# The training and validation sets can be evaluated the same way:
# evaluate_model(train_graphs, train_labels, model, output_dir, "train")
# evaluate_model(val_graphs, val_labels, model, output_dir, "validation")

# Evaluate the test set and export the attention weights of one specific `cpm_id`
evaluate_model(
    graphs=test_graphs,
    labels=test_labels,
    model=model,
    output_dir=output_dir,
    data_name="test",
    cpm_id='CPM05651',
)
