<a href="https://colab.research.google.com/github/Favourj-bit/gsoc_2023_pytorch_pathway_commons/blob/main/Modelling_with_Breast_Cancer_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
!pip install torch-geometric

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torch-geometric
  Downloading torch_geometric-2.3.1.tar.gz (661 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m661.6/661.6 kB[0m [31m21.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: torch-geometric
  Building wheel for torch-geometric (pyproject.toml) ... [?25l[?25hdone
  Created wheel for torch-geometric: filename=torch_geometric-2.3.1-py3-none-any.whl size=910459 sha256=cbdac7c0dbacbba9caf7d840fc8c38cef34cb9b5d52825632d0f52e4ba3af944
  Stored in directory: /root/.cache/pip/wheels/ac/dc/30/e2874821ff308ee67dcd7a66dbde912411e19e35a1addda028
Successfully built torch-geometric
Installing collected packages: torch-geometric
Successfully installed torch-geomet

# Importing Data and Libraries

In [4]:
import pandas as pd
import numpy as np
import torch
from torch_geometric.data import Data
from sklearn.model_selection import train_test_split

In [5]:
# Pick the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)

# No later cell in this notebook moves the model or the graphs onto `device`,
# so a CPU-only runtime is sufficient: report the missing GPU instead of aborting.
if device.type != 'cuda':
    print('GPU device not found; continuing on CPU')

Using device: cuda


In [6]:
# Load the patient table from the Colab working directory.
# NOTE(review): the saved output of this cell is a FileNotFoundError, so this path did
# not exist in the recorded session. The pathway file further down is read from the
# mounted Drive folder — confirm where this file actually lives.
Brca_patients = pd.read_csv('/content/Brca_patients')

FileNotFoundError: ignored

In [None]:
# Index the rows by sample identifier and clear the index label.
Brca_patients = Brca_patients.set_index('Sample Identifier').rename_axis(None)
Brca_patients.head(2)

In [None]:
# Separate the regression target from the feature columns.
target_column = 'Overall Survival (Months)'

y = np.array(Brca_patients[target_column], dtype=float)
X = Brca_patients.drop(columns=[target_column])

In [None]:
# Location of the gzipped SIF interaction file on the mounted Drive.
path = "/content/drive/MyDrive/gsoc data/PathwayCommons12.reactome.hgnc.sif.gz"

# The file is read as tab-separated, header-less text with three columns.
sif_columns = ["Source", "InteractionType", "Target"]
df = pd.read_csv(
    path,
    sep="\t",
    compression="gzip",  # stated explicitly because the file is compressed
    header=None,
    names=sif_columns,
)

In [None]:
# Only the two endpoints of each interaction are used below; drop the type column.
df = df.drop('InteractionType', axis=1)

In [None]:
# Preview the first rows of the interaction table.
df.head()

# Selecting Nodes Common to Both Datasets

In [None]:
# Every distinct name that appears as either endpoint of an interaction, sorted.
all_nodes = sorted(set(df['Source']).union(df['Target']))
len(all_nodes)

In [None]:
# Number of network nodes that are also columns of the feature table.
len(set(all_nodes) & set(X.columns))

In [None]:
# Sorted names that are both feature columns and network nodes.
used_nodes = sorted(set(X.columns).intersection(all_nodes))
len(used_nodes)

In [None]:
# Restrict the feature table to the shared nodes, in `used_nodes` order.
X = X[used_nodes]
X.head(2)

In [None]:
# Keep only the interactions whose two endpoints are both in `used_nodes`.
# Both endpoint checks are combined into one mask so the table is filtered in a
# single pass and each mask is named after the column it tests.
source_in_use = df['Source'].isin(used_nodes)
target_in_use = df['Target'].isin(used_nodes)

# Renumber the surviving rows from 0, discarding the old index.
df = df.loc[source_in_use & target_in_use].reset_index(drop=True)

In [None]:
# Count the distinct nodes that remain as an endpoint of at least one interaction.
len(set(df['Source']) | set(df['Target']))

In [None]:

# Keep only the feature columns of nodes that still take part in an interaction,
# in sorted order.
columns_to_select = sorted(set(df['Source']) | set(df['Target']))
X = X.loc[:, columns_to_select]
X.head(2)

# Creating Edge Index from Pathway Commons

In [None]:
# Endpoints of every interaction, in row order.
source_nodes = df['Source'].to_list()
target_nodes = df['Target'].to_list()

# A node's integer id is the position of its column in X.
node_to_index = dict(zip(X.columns, range(len(X.columns))))

# Translate the endpoint names into integer ids.
source_indices = [node_to_index[name] for name in source_nodes]
target_indices = [node_to_index[name] for name in target_nodes]

In [None]:
# Build the edge list as a 2 x num_edges integer tensor:
# row 0 holds the source ids, row 1 the target ids.
edge_index = torch.tensor([source_indices, target_indices], dtype=torch.long)

In [None]:
# Display the edge tensor.
edge_index

In [None]:
# Shape of the edge tensor (two rows, one column per edge).
edge_index.shape

# Splitting the Data into Train and Test Splits

In [None]:
# Deterministic 80/20 split by row position (no shuffling): the first 80% of the
# rows form the training set and the remainder the test set.
split_index = int(X.shape[0] * 0.8)

X_train, X_test = X.iloc[:split_index], X.iloc[split_index:]
y_train, y_test = y[:split_index], y[split_index:]

In [None]:
# Shapes of the training and test feature tables.
X_train.shape, X_test.shape

In [None]:
# Work with plain NumPy arrays from here on.
X_train, X_test = X_train.to_numpy(), X_test.to_numpy()

In [None]:
# Feature vector of the first training row.
X_train[0]

In [None]:
# Feature vector of the second test row.
X_test[1]

In [None]:
# Training targets.
y_train

# Generating patient-specific graphs

In [None]:
num_patients_train = X_train.shape[0]
num_patients_test = X_test.shape[0]

# One graph per training patient, stored as (node features, edge index, target).
# Every tuple references the same edge_index; only the features and the target differ.
graphs_train = [
    (X_train[row], edge_index, y_train[row]) for row in range(num_patients_train)
]

# Same construction for the test patients.
graphs_test = [
    (X_test[row], edge_index, y_test[row]) for row in range(num_patients_test)
]

In [None]:
# Check the number of patient-specific graphs: one per row of X_train / X_test.
# NOTE(review): the original comments expected 62 and 16 — confirm against the actual data.
print(len(graphs_train))  # Should be 62
print(len(graphs_test))  # Should be 16

In [None]:
# Inspect the patient-specific graph of one training patient.
# The tuple is unpacked into dedicated sample_* names so that the notebook-level
# `y` (label array), `edge_index` and `node_features` are not overwritten.
patient_index = 0  # Index of the patient
sample_features, sample_edge_index, sample_target = graphs_train[patient_index]
print(sample_features)  # Node features for the specific patient
print(sample_edge_index)  # Edge index for the specific patient
print(sample_target)  # Target value for the specific patient

In [None]:
# Inspect the patient-specific graph of one test patient.
# Index 15 is used when the test set is large enough; otherwise the last graph is
# shown so that a smaller test set does not raise an IndexError.
patient_index = min(15, len(graphs_test) - 1)  # Index of the patient
# Dedicated sample_* names keep the notebook-level `y` and `edge_index` intact.
sample_features, sample_edge_index, sample_target = graphs_test[patient_index]
print(sample_features)  # Node features for the specific patient
print(sample_edge_index)  # Edge index for the specific patient
print(sample_target)  # Target value for the specific patient

# Converting List of Graphs to Data Objects

In [None]:
# Wrap every training graph in a Data object. The feature vector becomes a
# column (one scalar feature per node) and the target a 0-d tensor.
data_train = [
    Data(x=torch.tensor(features.reshape(-1, 1)), edge_index=edges, y=torch.tensor(label))
    for features, edges, label in graphs_train
]

# Same conversion for the test graphs.
data_test = [
    Data(x=torch.tensor(features.reshape(-1, 1)), edge_index=edges, y=torch.tensor(label))
    for features, edges, label in graphs_test
]

In [None]:
# Node-feature vector of the first training graph.
graphs_train[0][0]

In [None]:
# Full (node features, edge index, target) tuple of the first training graph.
graphs_train[0]

In [None]:
# Display the list of test Data objects.
data_test

In [None]:
# Inspect the first Data object of the training set and its three attributes.
sample = data_train[0]  # Get the first data object
print(sample)  # Print the data object

# Pull out the node features, edge indices, and target label.
# NOTE: these assignments rebind the notebook-level names `node_features`,
# `edge_index` and `target` to the attributes of this one sample.
node_features = sample.x
edge_index = sample.edge_index
target = sample.y

print(node_features)  # Print the node features
print(edge_index)  # Print the edge indices
print(target)  # Print the target label

# Creating Train and Test Batches

In [None]:
from torch_geometric.loader import DataLoader

# Mini-batch loaders: training batches are shuffled, test batches keep their order.
train_loader = DataLoader(data_train, batch_size=32, shuffle=True)
test_loader = DataLoader(data_test, batch_size=8, shuffle=False)

# Walk through the training batches once to show what each one contains.
for step, data in enumerate(train_loader):
    print(
        'Training Batches: ',
        f'Step {step + 1}:',
        '=======',
        f'Number of graphs in the current batch: {data.num_graphs}',
        sep='\n',
    )
    print(data)
    print()

In [None]:
# Walk through the test batches once to show what each one contains.
for step, data in enumerate(test_loader):
    print(
        'Test Batches: ',
        f'Step {step + 1}:',
        '=======',
        f'Number of graphs in the current batch: {data.num_graphs}',
        sep='\n',
    )
    print(data)
    print()

# Model Building and Evaluation

In [None]:
from torch.nn import Linear
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.nn import global_mean_pool

Ideas to try next:

- `GATConv`
- `GraphNorm`
- Dropout for all layers

In [None]:
class GCN(torch.nn.Module):
    """Three-layer graph convolutional network for graph-level regression.

    Node embeddings from three GCNConv layers are mean-pooled per graph and mapped
    to a single scalar by a linear layer.

    Args:
        hidden_channels: Output width of every GCNConv layer.
        num_node_features: Number of input features per node.
    """

    def __init__(self, hidden_channels, num_node_features):
        super().__init__()
        # Seed before the layers are created so their initial weights are repeatable.
        torch.manual_seed(12345)
        self.conv1 = GCNConv(num_node_features, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.conv3 = GCNConv(hidden_channels, hidden_channels)
        self.lin = Linear(hidden_channels, 1)  # Regression output with 1 dimension

    def forward(self, x, edge_index, batch):
        """Predict one value per graph in the batch.

        Args:
            x: Node feature matrix for all graphs in the batch.
            edge_index: Edge list passed to every convolution.
            batch: Graph assignment of each node, used by the pooling step.

        Returns:
            Tensor with one prediction per graph (the trailing size-1 axis removed).
        """
        # 1. Obtain node embeddings
        x = self.conv1(x, edge_index)
        x = x.relu()
        x = self.conv2(x, edge_index)
        x = x.relu()
        x = self.conv3(x, edge_index)

        # 2. Readout layer
        x = global_mean_pool(x, batch)  # [batch_size, hidden_channels]

        # 3. Apply a final regression layer (dropout is active in training mode only)
        x = F.dropout(x, p=0.2, training=self.training)
        x = self.lin(x)

        # Drop only the trailing feature axis. A bare squeeze() would also remove the
        # batch axis when the batch holds a single graph, producing a 0-d output that
        # no longer lines up with the per-graph targets.
        return x.squeeze(-1)

In [None]:
model = GCN(hidden_channels=64, num_node_features=1)
# Inputs are cast to double below, so the parameters must be double as well.
# The cast is done before the optimizer is built from the parameters.
model.double()

criterion = torch.nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

num_epochs = 101  # Specify the number of epochs

for epoch in range(num_epochs):
    # ---- training pass ----
    model.train()  # Set the model to train mode
    total_loss = 0.0

    for data in train_loader:
        optimizer.zero_grad()

        out = model(data.x.double(), data.edge_index, data.batch)
        # view(-1) on both sides keeps prediction and target 1-D and aligned,
        # including for a batch that contains a single graph.
        loss = criterion(out.view(-1), data.y.view(-1).double())
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Each loss value is a per-batch mean, so average over the number of batches.
    average_loss = total_loss / len(train_loader)

    # ---- evaluation pass on the test set ----
    model.eval()  # Set the model to evaluation mode
    test_loss = 0.0
    with torch.no_grad():
        for data in test_loader:
            out = model(data.x.double(), data.edge_index, data.batch)
            loss = criterion(out.view(-1), data.y.view(-1).double())
            test_loss += loss.item()

    # Same normalisation as the training loss: number of batches, not number of
    # graphs, so that the two reported figures are on the same scale.
    average_test_loss = test_loss / len(test_loader)
    print(f'Epoch: {epoch:03d}, Train loss: {average_loss:.4f}, Test Loss: {average_test_loss:.4f}')

In [None]:
model.eval()  # Set the model to evaluation mode

predictions = []  # One 1-D array of predicted values per test batch

with torch.no_grad():
    for data in test_loader:
        out = model(data.x.double(), data.edge_index, data.batch)
        # reshape(-1) guarantees a 1-D array even when the model returns a 0-d
        # tensor for a single-graph batch; np.concatenate rejects 0-d arrays.
        predictions.append(out.detach().reshape(-1).numpy())

# Concatenate the predictions from multiple batches
predictions = np.concatenate(predictions)

# Print the predictions
print(predictions)

In [None]:
from sklearn.metrics import r2_score

# Ground-truth targets, gathered batch by batch from the test loader. The loader is
# created with shuffle=False, so the order matches the order of `predictions`.
y_true = torch.cat([batch.y.view(-1) for batch in test_loader]).double()

# as_tensor accepts the NumPy array built by the prediction cell (and also a tensor),
# so this cell can be re-run without first re-running the prediction cell.
y_pred = torch.as_tensor(predictions, dtype=torch.double).view(-1)

# Calculate the mean squared error (MSE) loss using PyTorch's function
mse_loss = torch.nn.functional.mse_loss(y_pred, y_true)

# Calculate the R-squared (coefficient of determination) using scikit-learn's function.
# NOTE(review): R-squared is a goodness-of-fit measure, not an accuracy; the label in
# the printed line below is kept as-is but is worth rewording.
r2 = r2_score(y_true.numpy(), y_pred.numpy())

print(f"Mean Squared Error (MSE) Loss: {mse_loss.item():.4f}")
print(f"R-squared (Accuracy): {r2:.4f}")
