<a href="https://colab.research.google.com/github/cannin/gsoc_2023_pytorch_pathway_commons/blob/main/pyg/modelling_with_breast_cancer_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Downloading Data

In [None]:
# Download X_train.csv from Zenodo. The URL is quoted because `?` is a shell glob character.
!wget "https://zenodo.org/record/8232723/files/X_train.csv?download=1" -O X_train.csv

--2023-08-10 13:57:20--  https://zenodo.org/record/8232723/files/X_train.csv?download=1
Resolving zenodo.org (zenodo.org)... 188.185.124.72
Connecting to zenodo.org (zenodo.org)|188.185.124.72|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 44836667 (43M) [text/plain]
Saving to: ‘X_train.csv’

X_train.csv          19%[==>                 ]   8.22M   418KB/s    eta 85s    

In [None]:
# Download X_test.csv from Zenodo. The URL is quoted because `?` is a shell glob character.
!wget "https://zenodo.org/record/8232723/files/X_test.csv?download=1" -O X_test.csv

In [None]:
# Download y_test.csv from Zenodo. The URL is quoted because `?` is a shell glob character.
!wget "https://zenodo.org/record/8232723/files/y_test.csv?download=1" -O y_test.csv

In [None]:
# Download y_train.csv from Zenodo. The URL is quoted because `?` is a shell glob character.
!wget "https://zenodo.org/record/8232723/files/y_train.csv?download=1" -O y_train.csv

In [None]:
# Download X_val.csv from Zenodo. The URL is quoted because `?` is a shell glob character.
!wget "https://zenodo.org/record/8232723/files/X_val.csv?download=1" -O X_val.csv

In [None]:
# Download y_val.csv from Zenodo. The URL is quoted because `?` is a shell glob character.
!wget "https://zenodo.org/record/8232723/files/y_val.csv?download=1" -O y_val.csv

In [None]:
# Download edge_index.pt from Zenodo. The URL is quoted because `?` is a shell glob character.
!wget "https://zenodo.org/record/8233085/files/edge_index.pt?download=1" -O edge_index.pt

In [None]:
# Use the %pip magic (not !pip) so the package is installed into the environment
# of the running kernel rather than whichever `pip` is first on the shell PATH.
%pip install torch-geometric

# Importing Data and Libraries

In [None]:
import pandas as pd
import numpy as np
import torch
from torch_geometric.data import Data
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [None]:
# Select the compute device: CUDA when available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)

if device.type != 'cuda':
    # The rest of the notebook only ever addresses hardware through `device`
    # (`.to(device)`), so it still runs without a GPU. Warn instead of aborting.
    print('WARNING: GPU device not found; falling back to CPU.')

In [None]:
# Feature tables: one CSV per split, with the first column used as the row index.
X_train, X_test, X_val = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ("X_train.csv", "X_test.csv", "X_val.csv")
)

# Targets: comma-delimited numeric text files, one per split.
y_train, y_test, y_val = (
    np.loadtxt(csv_path, delimiter=',')
    for csv_path in ("y_train.csv", "y_test.csv", "y_val.csv")
)

In [None]:
# Load the edge index saved in edge_index.pt.
# - weights_only=True restricts unpickling to tensors and primitive containers;
#   the file was downloaded from the internet and must not run arbitrary pickle code.
# - map_location='cpu' loads onto the CPU whatever device the tensor was saved from;
#   batches are moved to `device` later in the notebook.
edge_index = torch.load('edge_index.pt', map_location='cpu', weights_only=True)

In [None]:
# Display the object loaded from edge_index.pt (bare last expression is rendered as cell output).
edge_index

In [None]:
# Display the dimensions of the loaded edge index.
edge_index.shape

In [None]:
# Show the (rows, columns) shape of the train, test and validation feature tables.
tuple(split.shape for split in (X_train, X_test, X_val))

In [None]:
# Convert the feature tables to plain NumPy arrays.
# np.asarray is used instead of `.values` so the cell can be re-run safely:
# it is a no-op when the variable already holds an ndarray, whereas `.values`
# would raise AttributeError on the second execution.
X_train = np.asarray(X_train)
X_test = np.asarray(X_test)
X_val = np.asarray(X_val)

# Generating patient-specific graphs

In [None]:
def _build_patient_graphs(features, targets, edges):
    """Build one ``(node_features, edge_index, target)`` tuple per patient.

    Args:
        features: 2-D array indexed by patient along the first axis; row ``i``
            becomes the node-feature vector of patient ``i``.
        targets: Per-patient target values, indexed in the same order as ``features``.
        edges: Edge index shared (by reference) across every patient graph.

    Returns:
        A list with ``features.shape[0]`` tuples.
    """
    return [(features[i], edges, targets[i]) for i in range(features.shape[0])]


num_patients_train = X_train.shape[0]
num_patients_test = X_test.shape[0]
num_patients_val = X_val.shape[0]

# Patient-specific graphs for each split. Building all three through the same
# helper avoids the copy-paste slip that previously filled the validation list
# with copies of the last *test* graph.
graphs_train = _build_patient_graphs(X_train, y_train, edge_index)
graphs_test = _build_patient_graphs(X_test, y_test, edge_index)
graphs_val = _build_patient_graphs(X_val, y_val, edge_index)

In [None]:
# Report how many patient graphs each split holds, one count per line
# (train, test, validation). The original notes expect 857 train and 217 test.
print(len(graphs_train), len(graphs_test), len(graphs_val), sep='\n')

In [None]:
# Inspect one patient-specific graph from the training split:
# its node features, the edge index and the target, each on its own line.
patient_index = 0
node_features, edge_index, y = graphs_train[patient_index]
print(node_features, edge_index, y, sep='\n')

In [None]:
# Inspect one patient-specific graph from the test split.
# The target is unpacked into `y` but deliberately not printed.
patient_index = 15
node_features, edge_index, y = graphs_test[patient_index]
print(node_features, edge_index, sep='\n')

In [None]:
# Inspect one patient-specific graph from the validation split.
# The target is unpacked into `y` but deliberately not printed.
patient_index = 15
node_features, edge_index, y = graphs_val[patient_index]
print(node_features, edge_index, sep='\n')

# Converting List of Graphs to Data Objects

In [None]:
def _as_data_list(graphs):
    """Turn ``(node_features, edge_index, target)`` tuples into PyG ``Data`` objects.

    Each feature vector is reshaped to a column of shape ``(num_nodes, 1)``,
    where ``num_nodes`` is the length of the first graph's feature vector in
    the same list. The edge index is passed through unchanged.
    """
    return [
        Data(
            x=torch.tensor(features.reshape(len(graphs[0][0]), 1)),
            edge_index=edges,
            y=torch.tensor(label),
        )
        for features, edges, label in graphs
    ]


# One list of Data objects per split.
data_train = _as_data_list(graphs_train)
data_test = _as_data_list(graphs_test)
data_val = _as_data_list(graphs_val)

In [None]:
# Show the first training Data object as a whole...
sample = data_train[0]
print(sample)

# ...then its node features, edge index and target label, one per line.
node_features, edge_index, target = sample.x, sample.edge_index, sample.y
print(node_features, edge_index, target, sep='\n')

# Creating Train, Test and Validation Batches

In [None]:
from torch_geometric.loader import DataLoader

In [None]:
bs = 16  # mini-batch size shared by all three loaders

# Only the training loader shuffles. The evaluation loaders keep a fixed order:
# the validation loss later in the notebook is an average of per-batch means,
# so shuffling would change which samples fall into the (possibly smaller)
# last batch and make the reported value jitter between epochs.
train_loader = DataLoader(data_train, batch_size=bs, shuffle=True)
test_loader = DataLoader(data_test, batch_size=bs, shuffle=False)
val_loader = DataLoader(data_val, batch_size=bs, shuffle=False)

# Sanity check: walk the training batches once and print what each one contains.
for step, data in enumerate(train_loader):
    data = data.to(device)  # Move the batch of data to the device

    print('Training Batches: ')
    print(f'Step {step + 1}:')
    print('=======')
    print(f'Number of graphs in the current batch: {data.num_graphs}')
    print(data)
    print()

In [None]:
# Walk the test batches once and print a short summary of each,
# followed by a blank line.
for step, data in enumerate(test_loader):
    data = data.to(device)
    print(
        'Test Batches: ',
        f'Step {step + 1}:',
        '=======',
        f'Number of graphs in the current batch: {data.num_graphs}',
        data,
        '',
        sep='\n',
    )

In [None]:
# Walk the validation batches once and print a short summary of each,
# followed by a blank line.
for step, data in enumerate(val_loader):
    data = data.to(device)
    print(
        'Val Batches: ',
        f'Step {step + 1}:',
        '=======',
        f'Number of graphs in the current batch: {data.num_graphs}',
        data,
        '',
        sep='\n',
    )

# Model Building and Evaluation

In [None]:
from torch.nn import Linear
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.nn import global_mean_pool

GATConv

GraphNorm

Dropout for all layers

In [None]:
class GCN(torch.nn.Module):
    """Graph-level regressor built from four ``GCNConv`` layers.

    Node features go through four graph convolutions (ReLU after the first
    two), are mean-pooled per graph, passed through dropout (p=0.2) and mapped
    to a single value per graph by a linear layer.

    Args:
        hidden_channels: Output width of every graph-convolution layer.
        num_node_features: Number of input features per node.
    """

    def __init__(self, hidden_channels, num_node_features):
        super().__init__()
        # Seed the global torch RNG before the layers below are constructed.
        torch.manual_seed(12345)
        self.conv1 = GCNConv(num_node_features, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.conv3 = GCNConv(hidden_channels, hidden_channels)
        self.conv4 = GCNConv(hidden_channels, hidden_channels)
        self.lin = Linear(hidden_channels, 1)  # Regression output with 1 dimension

    def forward(self, x, edge_index, batch):
        """Return one prediction per graph in the batch.

        Args:
            x: Node feature matrix for all graphs in the batch.
            edge_index: Edge index for the batched graph.
            batch: Vector assigning each node to its graph, used for pooling.

        Returns:
            Tensor of shape ``[num_graphs]``, including ``[1]`` for a batch
            that contains a single graph.
        """
        # 1. Obtain node embeddings
        x = self.conv1(x, edge_index).relu()
        x = self.conv2(x, edge_index).relu()
        # NOTE(review): conv3 feeds conv4 with no activation in between -
        # confirm this is intended.
        x = self.conv3(x, edge_index)
        x = self.conv4(x, edge_index)

        # 2. Readout layer
        x = global_mean_pool(x, batch)  # [batch_size, hidden_channels]

        # 3. Apply a final regression layer
        x = F.dropout(x, p=0.2, training=self.training)
        x = self.lin(x)

        # Drop only the trailing size-1 dimension. A bare squeeze() would also
        # drop the batch dimension when the batch holds one graph, producing a
        # 0-d tensor that no longer matches the target shape.
        return x.squeeze(-1)

In [None]:
import time

In [None]:
def _batch_loss(batch):
    """Move one mini-batch to `device` and return the criterion's loss tensor for it."""
    batch = batch.to(device)
    prediction = model(batch.x.double(), batch.edge_index, batch.batch)
    return criterion(prediction, batch.y.view(-1).double())


start_time = time.time()

# Model, loss and optimizer. The optimizer is created first; the model is then
# moved to the device and switched to double precision.
model = GCN(hidden_channels=64, num_node_features=1)
criterion = torch.nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

model = model.to(device)
criterion = criterion.to(device)
model.double()

num_epochs = 101

# Per-epoch loss histories (mean of the per-batch losses).
train_losses, val_losses = [], []

for epoch in range(num_epochs):
    # ---- training pass ----
    model.train()
    running_train_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()
        batch_loss = _batch_loss(batch)
        batch_loss.backward()
        optimizer.step()
        running_train_loss += batch_loss.item()

    average_loss = running_train_loss / len(train_loader)
    train_losses.append(average_loss)

    # ---- validation pass (no gradients) ----
    model.eval()
    with torch.no_grad():
        running_val_loss = sum(_batch_loss(batch).item() for batch in val_loader)

    average_val_loss = running_val_loss / len(val_loader)
    val_losses.append(average_val_loss)

    print(f'Epoch: {epoch:03d}, Train loss: {average_loss:.4f}, Validation Loss: {average_val_loss:.4f}')

# Wall-clock time for setup plus all epochs.
elapsed_time = time.time() - start_time

print(f"Time used for training: {elapsed_time:.2f} seconds")

In [None]:
# Evaluate the trained model on the test split and collect its predictions.
model.eval()  # disables dropout

predictions = []  # per-batch prediction arrays, concatenated below
test_losses = []
test_loss = 0
with torch.no_grad():
    for data in test_loader:
        data = data.to(device)
        # Flatten so a batch holding a single graph still yields a 1-D tensor;
        # np.concatenate below rejects 0-d arrays.
        out = model(data.x.double(), data.edge_index, data.batch).reshape(-1)
        loss = criterion(out, data.y.view(-1).double())
        test_loss += loss.item()

        predictions.append(out.detach().cpu().numpy())

# Mean of the per-batch losses, computed once after the loop. The previous
# code recomputed it for every batch and appended the *validation* loss.
average_test_loss = test_loss / len(test_loader)
test_losses.append(average_test_loss)

# Concatenate the predictions from multiple batches
predictions = np.concatenate(predictions)

# Print the first ten predictions
print(predictions[:10])

In [None]:
# Plot training and validation loss against the epoch number (1-based).
epochs = range(1, num_epochs + 1)

fig, ax = plt.subplots()
ax.plot(epochs, train_losses, label='Training Loss')
ax.plot(epochs, val_losses, label='Validation Loss')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.legend()
ax.set_title('Convergence of Model')
plt.show()

In [None]:
from sklearn.metrics import r2_score

# Predictions as a tensor. as_tensor accepts both an ndarray and an existing
# tensor, so re-running this cell does not warn or copy.
predictions = torch.as_tensor(predictions)

# Ground-truth targets in loader order. test_loader is built with
# shuffle=False, so this order matches the order of `predictions`.
y_true = torch.cat([batch.y.view(-1) for batch in test_loader])

# Mean squared error over the whole test split
mse_loss = torch.nn.functional.mse_loss(predictions.view(-1), y_true.view(-1))

print(f"Mean Squared Error (MSE) Loss: {mse_loss:.4f}")

# Coefficient of determination; r2_score takes (y_true, y_pred) in that order.
r2 = r2_score(y_true.view(-1).numpy(), predictions.view(-1).numpy())

print(f"R^2 Score: {r2:.4f}")