In [None]:
# Attach the user's Google Drive to the Colab VM so that files stored there
# can be opened through the local filesystem under /content/drive.
from google.colab import drive
drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [None]:
!pip install -q torch_geometric

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.1/63.1 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m18.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# GTN using multi-omics data (mRNA, miRNA and DNA methylation) with PPI graph structure (5 fold cross validation)
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch_geometric.data import Data
from torch_geometric.nn import TransformerConv
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import KFold
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
import torch.nn.functional as F
from torch.optim.lr_scheduler import ReduceLROnPlateau
import datetime
# Shorthand for reading the wall clock; used to time the training cell
now = datetime.datetime.now

# Pick the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")

Using device: cuda


In [None]:
# Step 1: Load the PPI data
# The path is absolute (rooted at the Drive mount point) so that reading the file
# does not depend on the notebook's current working directory.
ppi_file_path = '/content/drive/My Drive/Projects/Gene_Expression_Project/PPI.csv'
ppi_df = pd.read_csv(ppi_file_path)

# Step 2: Concatenate 'stringId_A' and 'stringId_B' so that every appearance of a
# protein at either end of an edge counts towards its number of connections (degree)
all_proteins = pd.concat([ppi_df['stringId_A'], ppi_df['stringId_B']])

# Step 3: Count the number of connections for each protein
protein_connections = all_proteins.value_counts()

# Step 4: Define a degree threshold to select only highly connected proteins
# (proteins with at least this many connections are kept)
degree_threshold = 200
high_degree_proteins = protein_connections[protein_connections >= degree_threshold].index

# Step 5: Filter the PPI data to include only edges where BOTH endpoints are
# high-degree proteins
ppi_filtered = ppi_df[
    ppi_df['stringId_A'].isin(high_degree_proteins) &
    ppi_df['stringId_B'].isin(high_degree_proteins)
]

# Step 6: Map the high-degree proteins to unique, consecutive node IDs
# (IDs follow the order of first appearance in the filtered edge list)
proteins = pd.concat([ppi_filtered['stringId_A'], ppi_filtered['stringId_B']]).unique()
protein_to_id = {protein: idx for idx, protein in enumerate(proteins)}

# Step 7: Create edge index (this will be the input for GTN)
# Series.map with a dict is applied column by column; unlike DataFrame.map it does
# not require pandas >= 2.1, and it avoids a Python-level call per cell.
edges = ppi_filtered[['stringId_A', 'stringId_B']].apply(lambda col: col.map(protein_to_id))
# Transpose to the (2, num_edges) layout: row 0 = source IDs, row 1 = target IDs
edge_index = torch.tensor(edges.to_numpy().T, dtype=torch.long).to(device)

In [None]:
# Step 8: Load and preprocess the multi-omics data
!wget https://www.webpages.uidaho.edu/vakanski/Codes_Data/mRNA_miRNA_Meth_integrated.csv
file_path = 'mRNA_miRNA_Meth_integrated.csv'
df = pd.read_csv(file_path)
df.drop(df.columns[0], axis=1, inplace=True)
Y = df.iloc[:, -1].copy()

# Remove non-numeric columns
df = df.select_dtypes(include=[np.number])
X = df.values

num_classes = len(set(Y))
print("Number of classes:", num_classes)
num_samples = X.shape[0]
print("Number of samples:", num_samples)
num_Features = X.shape[1]
print("Number of Features:", num_Features)

# Standardize the features
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Encode labels
label_encoder = LabelEncoder()
Y = label_encoder.fit_transform(Y)

# Convert data to PyTorch tensors and move to device
X = torch.tensor(X, dtype=torch.float).to(device)
Y = torch.tensor(Y, dtype=torch.long).to(device)

--2025-03-07 18:48:58--  https://www.webpages.uidaho.edu/vakanski/Codes_Data/mRNA_miRNA_Meth_integrated.csv
Resolving www.webpages.uidaho.edu (www.webpages.uidaho.edu)... 129.101.105.230
Connecting to www.webpages.uidaho.edu (www.webpages.uidaho.edu)|129.101.105.230|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 123599052 (118M) [application/octet-stream]
Saving to: ‘mRNA_miRNA_Meth_integrated.csv’


2025-03-07 18:49:03 (31.7 MB/s) - ‘mRNA_miRNA_Meth_integrated.csv’ saved [123599052/123599052]

Number of classes: 32
Number of samples: 8464
Number of Features: 2793


In [None]:
# Step 9: Create PyTorch Geometric data object using the edge_index from the filtered PPI network
# The rows of X become the node features and edge_index supplies the connectivity.
# NOTE(review): `data` is not referenced by the other cells visible here (each
# cross-validation fold builds its own Data objects) -- confirm it is still needed.
data = Data(x=X, edge_index=edge_index)

# Step 10: Define the GTN model
class GTN(nn.Module):
    """Stack of TransformerConv layers followed by a linear classification head.

    Args:
        num_features: Width of each node's input feature vector.
        num_classes: Number of output classes produced by the final linear layer.
        hidden_feats: Output width of every TransformerConv layer.
        num_layers: Number of TransformerConv layers. One layer is always
            created; values above 1 add further hidden-to-hidden layers.
        dropout: Probability of the dropout applied after each layer; the same
            value is also passed as the ``dropout`` argument of every
            TransformerConv.
    """

    def __init__(self, num_features, num_classes, hidden_feats=1024, num_layers=2, dropout=0.5):
        super().__init__()
        # Input width per layer: the first layer reads the raw features, all
        # following layers read the hidden representation.
        in_widths = [num_features] + [hidden_feats] * (num_layers - 1)
        self.convs = nn.ModuleList(
            [TransformerConv(width, hidden_feats, heads=1, dropout=dropout) for width in in_widths]
        )
        self.fc = nn.Linear(hidden_feats, num_classes)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, data):
        """Return log-probabilities over the classes for every node of ``data``.

        ``data`` must expose ``x`` (node features) and ``edge_index``.
        """
        h = data.x
        edge_index = data.edge_index
        # Each layer: graph convolution -> ReLU -> dropout
        for layer in self.convs:
            h = self.dropout(F.relu(layer(h, edge_index)))
        # log_softmax output is meant to be paired with a negative log-likelihood loss
        return F.log_softmax(self.fc(h), dim=1)

# Step 11: Set up K-fold cross-validation
# Seed every random number generator involved so that fold membership, weight
# initialisation and dropout masks are repeatable from a fresh kernel.
# (Some GPU kernels may still cause small run-to-run differences.)
seed = 42
np.random.seed(seed)
torch.manual_seed(seed)

k = 5
# random_state pins the shuffle, so every run evaluates on the same folds
kf = KFold(n_splits=k, shuffle=True, random_state=seed)

# Initialize lists to store metrics for each fold (one entry per fold)
precision_scores = []
recall_scores = []
accuracy_scores = []
F1Measure = []

# Set hyperparameters
hidden_feats = 1024   # output width of every TransformerConv layer
num_layers = 2        # number of TransformerConv layers
dropout = 0.5         # dropout probability used inside the model
lr = 0.001            # Adam learning rate
weight_decay = 0      # L2 penalty passed to Adam (0 disables it)
num_epochs = 100      # full-batch training steps per fold

In [None]:
# Step 12: Training and Evaluation
t = now()

# Start from empty metric lists so that re-running this cell does not append a
# second set of folds to the results of an earlier run. The lists are cleared in
# place so any other reference to them stays valid.
for fold_metric_list in (accuracy_scores, precision_scores, recall_scores, F1Measure):
    fold_metric_list.clear()

# Loop-invariant quantities, computed once instead of once per fold
n_samples = X.shape[0]
n_classes = int(torch.unique(Y).numel())
max_node_id = int(edge_index.max()) if edge_index.numel() > 0 else -1

# KFold only needs the number of samples, so split a lightweight index array
# rather than copying the whole feature matrix back to the CPU.
for train_index, test_index in kf.split(np.arange(n_samples)):
    # The same edge_index is reused for both subsets, so every node ID it mentions
    # must exist in each of them. Fail early with a readable message instead of an
    # opaque out-of-range indexing error inside the graph convolution.
    smallest_subset = min(len(train_index), len(test_index))
    if max_node_id >= smallest_subset:
        raise ValueError(
            f"edge_index refers to node {max_node_id}, but a fold subset has only "
            f"{smallest_subset} rows"
        )

    # Split the data into training and testing sets
    train_idx = torch.as_tensor(train_index, dtype=torch.long, device=X.device)
    test_idx = torch.as_tensor(test_index, dtype=torch.long, device=X.device)
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = Y[train_idx], Y[test_idx]

    # Re-standardise with statistics of the training rows only, so no information
    # about the held-out rows reaches the model through the scaling. Because
    # z-scoring is unaffected by an earlier affine rescaling, this matches fitting
    # the scaler on the raw training fold even if X was already standardised.
    fold_mean = X_train.mean(dim=0, keepdim=True)
    fold_std = X_train.std(dim=0, unbiased=False, keepdim=True)
    # Constant columns would divide by zero; leave them centred but unscaled
    fold_std = torch.where(fold_std > 0, fold_std, torch.ones_like(fold_std))
    X_train = (X_train - fold_mean) / fold_std
    X_test = (X_test - fold_mean) / fold_std

    # Create train/test data using the same PPI edge_index
    # NOTE(review): edge_index was built from protein IDs (Steps 6-7) while the rows
    # of X are samples, so these edges connect whichever samples occupy those row
    # positions after the split -- confirm this is the intended graph.
    train_data = Data(x=X_train, edge_index=edge_index)
    test_data = Data(x=X_test, edge_index=edge_index)

    # Create the GTN model (a fresh model, optimizer and scheduler for every fold)
    model = GTN(
        num_features=X.shape[1],
        num_classes=n_classes,
        hidden_feats=hidden_feats,
        num_layers=num_layers,
        dropout=dropout
    ).to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    # mode='max': the learning rate is cut by 10x when accuracy stops improving
    scheduler = ReduceLROnPlateau(optimizer, mode='max', factor=0.1, patience=10)
    # The model outputs log-probabilities, hence NLLLoss rather than CrossEntropyLoss
    criterion = nn.NLLLoss()

    # Labels of the held-out rows never change within a fold: convert them once
    y_test_np = y_test.cpu().numpy()

    for epoch in range(num_epochs):
        # One full-batch gradient step on the training graph
        model.train()
        optimizer.zero_grad()
        out = model(train_data)
        loss = criterion(out, y_train)
        loss.backward()
        optimizer.step()

        # Validation
        # NOTE(review): the scheduler is driven by accuracy on the held-out fold,
        # so that fold influences training; a separate validation split would
        # keep the final test estimate unbiased.
        model.eval()
        with torch.no_grad():
            logits = model(test_data)
            pred = torch.argmax(logits, dim=1)
            acc = accuracy_score(y_test_np, pred.cpu().numpy())
            # print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}, Validation Accuracy: {acc:.4f}')
            scheduler.step(acc)

    # Testing
    model.eval()
    with torch.no_grad():
        logits = model(test_data)
        pred = torch.argmax(logits, dim=1)
    pred_np = pred.cpu().numpy()

    test_acc = accuracy_score(y_test_np, pred_np)
    # zero_division=0 for all three scores: a class that is never predicted counts
    # as 0 instead of inflating the macro precision with a free 1.0, and precision,
    # recall and F1 handle the undefined case in the same way.
    precision = precision_score(y_test_np, pred_np, average='macro', zero_division=0)
    recall = recall_score(y_test_np, pred_np, average='macro', zero_division=0)
    f1 = f1_score(y_test_np, pred_np, average='macro', zero_division=0)

    accuracy_scores.append(test_acc)
    precision_scores.append(precision)
    recall_scores.append(recall)
    F1Measure.append(f1)
print('Training time: %s' % (now() - t))

Training time: 0:01:12.276266


In [None]:
# Calculate the average metrics across all folds
# Each list holds one value per cross-validation fold.
average_accuracy = np.mean(accuracy_scores)
average_precision = np.mean(precision_scores)
average_recall = np.mean(recall_scores)
average_f1 = np.mean(F1Measure)

# Report mean and spread per metric. np.std uses its default ddof=0, i.e. the
# population standard deviation over the folds.
print("Average accuracy =", average_accuracy)
print("Accuracy std dev =", np.std(accuracy_scores))
print("Average precision =", average_precision)
print("Precision std dev =", np.std(precision_scores))
print("Average recall =", average_recall)
print("Recall std dev =", np.std(recall_scores))
print("Average F1 score =", average_f1)
print("F1 std dev =", np.std(F1Measure))

Average accuracy = 0.9520330550354051
Accuracy std sev = 0.004433865609198085
Average precision = 0.9446092787639447
Precision std sev = 0.007422989624697369
Average recall = 0.9209231654365416
Recall std sev = 0.013145008384676457
Average F1 score = 0.9230865539887884
F1 std dev = 0.012157202620385159
