In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install -q torch_geometric

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/63.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.1/63.1 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m41.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
# GTN using multi-omics data (mRNA, miRNA and DNA methylation) with correlation matrix graph structure (5 fold cross validation)
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch_geometric.data import Data
from torch_geometric.nn import TransformerConv
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import KFold
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
import torch.nn.functional as F
from torch.optim.lr_scheduler import ReduceLROnPlateau
import datetime
now = datetime.datetime.now

# Check if GPU is available and set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cuda


In [4]:
# Step 1: Load the PPI data
ppi_file_path = 'drive/My Drive/Projects/Gene_Expression_Project/PPI.csv'
ppi_df = pd.read_csv(ppi_file_path)

In [5]:
# Step 2: Concatenate 'stringId_A' and 'stringId_B' to calculate the number of connections (degree)
all_proteins = pd.concat([ppi_df['stringId_A'], ppi_df['stringId_B']])

# Step 3: Count the number of connections for each protein
protein_connections = all_proteins.value_counts()

# Step 4: Define a degree threshold to select only highly connected proteins (e.g., 200 or more connections)
degree_threshold = 200
high_degree_proteins = protein_connections[protein_connections >= degree_threshold].index

# Step 5: Filter the PPI data to include only edges where both proteins have a high number of connections
ppi_filtered = ppi_df[
    ppi_df['stringId_A'].isin(high_degree_proteins) &
    ppi_df['stringId_B'].isin(high_degree_proteins)
]

# Step 6: Map the high-degree proteins to unique node IDs
proteins = pd.concat([ppi_filtered['stringId_A'], ppi_filtered['stringId_B']]).unique()
protein_to_id = {protein: idx for idx, protein in enumerate(proteins)}

# Step 7: Create edge index (this will be the input for GTN)
edges = ppi_filtered[['stringId_A', 'stringId_B']].map(lambda x: protein_to_id[x])
edge_index = torch.tensor(edges.values.T, dtype=torch.long).to(device)

In [6]:
# Step 8: Load and preprocess the multi-omics data
!wget https://www.webpages.uidaho.edu/vakanski/Codes_Data/mRNA_miRNA_Meth_integrated.csv
file_path = 'mRNA_miRNA_Meth_integrated.csv'
df = pd.read_csv(file_path)
df.drop(df.columns[0], axis=1, inplace=True)
Y = df.iloc[:, -1].copy()

# Remove non-numeric columns
df = df.select_dtypes(include=[np.number])
X = df.values

num_classes = len(set(Y))
print("Number of classes:", num_classes)
num_samples = X.shape[0]
print("Number of samples:", num_samples)
num_Features = X.shape[1]
print("Number of Features:", num_Features)

# Standardize the features
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Encode labels
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(Y)

--2025-03-07 19:08:26--  https://www.webpages.uidaho.edu/vakanski/Codes_Data/mRNA_miRNA_Meth_integrated.csv
Resolving www.webpages.uidaho.edu (www.webpages.uidaho.edu)... 129.101.105.230
Connecting to www.webpages.uidaho.edu (www.webpages.uidaho.edu)|129.101.105.230|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 123599052 (118M) [application/octet-stream]
Saving to: ‘mRNA_miRNA_Meth_integrated.csv’


2025-03-07 19:08:34 (16.9 MB/s) - ‘mRNA_miRNA_Meth_integrated.csv’ saved [123599052/123599052]

Number of classes: 32
Number of samples: 8464
Number of Features: 2793


In [7]:
# Step 9: Define the GTN model
class GTN(nn.Module):
    def __init__(self, num_features, num_classes):
        super(GTN, self).__init__()
        self.conv1 = TransformerConv(num_features, 128, heads=8)  # Output: 128 * 8 = 1024
        self.conv2 = TransformerConv(1024, num_classes, heads=1)  # Input: 1024, Output: num_classes * 1

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = self.conv2(x, edge_index)
        return x

# Step 10: Set up K-fold cross-validation
k = 5
kf = KFold(n_splits=k, shuffle=True)

# Initialize lists to store metrics for each fold
precision_scores = []
recall_scores = []
accuracy_scores = []
F1Measure = []

# Set hyperparameters
hidden_feats = 1024
num_layers = 2
dropout = 0.5
lr = 0.001
weight_decay = 0
num_epochs = 100

In [8]:
# Step 11: Training and Evaluation
t = now()
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = labels[train_index], labels[test_index]

    # Convert labels to tensors within the loop
    y_train = torch.tensor(y_train, dtype=torch.long).to(device)
    y_test = torch.tensor(y_test, dtype=torch.long).to(device)

    # Calculate the correlation matrix and convert it to an edge index
    correlation_matrix_train = np.corrcoef(X_train, rowvar=True)
    correlation_matrix_test = np.corrcoef(X_test, rowvar=True)

    # Create edge indices based on a correlation threshold
    edge_index_train = torch.tensor(np.argwhere((correlation_matrix_train >= 0.9) | (correlation_matrix_train <= -0.9)).T, dtype=torch.long).to(device)
    edge_index_test = torch.tensor(np.argwhere((correlation_matrix_test >= 0.9) | (correlation_matrix_test <= -0.9)).T, dtype=torch.long).to(device)

    # Prepare training and testing data
    train_data = Data(x=torch.tensor(X_train, dtype=torch.float32).to(device), edge_index=edge_index_train, y=y_train)
    test_data = Data(x=torch.tensor(X_test, dtype=torch.float32).to(device), edge_index=edge_index_test, y=y_test)

    # Initialize the model, criterion, optimizer, and scheduler
    model = GTN(X.shape[1], len(np.unique(labels))).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)  # Adjusted learning rate
    scheduler = ReduceLROnPlateau(optimizer, mode='max', factor=0.1, patience=10)

    # Training loop
    num_epochs = 100
    for epoch in range(num_epochs):
        model.train()
        optimizer.zero_grad()
        out = model(train_data)
        loss = criterion(out, train_data.y)
        loss.backward()
        optimizer.step()

        # Validation
        model.eval()
        with torch.no_grad():
            out = model(test_data)
            pred = out.argmax(dim=1)
            acc = accuracy_score(y_test.cpu().numpy(), pred.cpu().numpy())
            # print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}, Validation Accuracy: {acc:.4f}')

    # Testing
    model.eval()
    with torch.no_grad():
        out = model(test_data)
        pred = out.argmax(dim=1)
        acc = accuracy_score(y_test.cpu().numpy(), pred.cpu().numpy())
        precision = precision_score(y_test.cpu().numpy(), pred.cpu().numpy(), average='macro')
        recall = recall_score(y_test.cpu().numpy(), pred.cpu().numpy(), average='macro')
        f1 = f1_score(y_test.cpu().numpy(), pred.cpu().numpy(), average='macro')

        accuracy_scores.append(acc)
        precision_scores.append(precision)
        recall_scores.append(recall)
        F1Measure.append(f1)
print('Training time: %s' % (now() - t))

Training time: 0:00:58.167997


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [9]:
# Calculate the average metrics across all folds
average_accuracy = np.mean(accuracy_scores)
average_precision = np.mean(precision_scores)
average_recall = np.mean(recall_scores)
average_f1 = np.mean(F1Measure)

print("Average accuracy =", average_accuracy)
print("Accuracy std sev =", np.std(accuracy_scores))
print("Average precision =", average_precision)
print("Precision std sev =", np.std(precision_scores))
print("Average recall =", average_recall)
print("Recall std sev =", np.std(recall_scores))
print("Average F1 score =", average_f1)
print("F1 std dev =", np.std(F1Measure))

Average accuracy = 0.9417534864041757
Accuracy std sev = 0.005470661962832667
Average precision = 0.9274269662489297
Precision std sev = 0.014940605006746219
Average recall = 0.9146926622751235
Recall std sev = 0.018925085638013497
Average F1 score = 0.9180457610451503
F1 std dev = 0.01783253028526419
