<a href="https://colab.research.google.com/github/cannin/gsoc_2023_pytorch_pathway_commons/blob/main/PyG_sample_data_with_InMemoryDataset_function.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# NOTE(review): the cells shown in this notebook read and write only under
# /content/sample_data - confirm whether this mount is still required.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install torch-geometric

# Importing Data and Libraries

In [None]:
import pandas as pd
import numpy as np
import torch
from torch_geometric.data import Data
from sklearn.model_selection import train_test_split

In [None]:
from torch_geometric.data import InMemoryDataset, Data, download_url, extract_zip
import os

# Creating InMemoryDataset Class for Train Set

In [None]:
class Acc_train(InMemoryDataset):
    """In-memory PyG dataset of patient-specific graphs for the training split.

    Every row of ``X_train.csv`` is one patient. The row is turned into a
    ``Data`` object whose ``x`` is a ``[num_nodes, 1]`` column of that row's
    values, whose ``edge_index`` is the single tensor stored in
    ``edge_index.pt`` (shared by all patients), and whose ``y`` is the
    matching entry of ``y_train.csv`` as a float32 tensor of shape ``[1]``.

    Args:
        root: Directory under which the ``raw`` and ``processed`` folders live.
        transform: Optional callable applied to a ``Data`` object on access.
        pre_transform: Optional callable applied to every ``Data`` object
            once, before the processed file is written.
        pre_filter: Optional predicate; ``Data`` objects for which it returns
            a falsy value are dropped before the processed file is written.
    """

    # Archive that contains the raw training files
    url = 'https://zenodo.org/record/8117485/files/Train%20set.zip?download=1'

    # Folder inside ``raw_dir`` that holds the extracted files
    _raw_subdir = 'Train set'

    def __init__(self, root, transform=None, pre_transform=None, pre_filter=None):
        super().__init__(root, transform, pre_transform, pre_filter)
        # The parent constructor has downloaded/processed the data if needed,
        # so the processed file is expected to exist at this point.
        self.data, self.slices = torch.load(self.processed_paths[0])

    @property
    def raw_file_names(self):
        """Raw files, relative to ``raw_dir``.

        The names include the extraction subfolder so that the existence
        check on the raw files succeeds after the first download and the
        archive is not fetched again on every instantiation.
        """
        return [os.path.join(self._raw_subdir, name)
                for name in ('X_train.csv', 'y_train.csv', 'edge_index.pt')]

    @property
    def processed_file_names(self):
        """Name of the collated file written by :meth:`process`."""
        return 'train_data.pt'

    def download(self):
        """Download the archive into ``raw_dir``, extract it, delete the zip."""
        path = download_url(self.url, self.raw_dir)
        extract_zip(path, self.raw_dir)
        # The zip file is removed once its contents are extracted
        os.unlink(path)

    def process(self):
        """Build one ``Data`` object per patient and save the collated result.

        Raises:
            ValueError: If the feature table and the target file do not
                contain the same number of patients.
        """
        raw_folder = os.path.join(self.raw_dir, self._raw_subdir)

        # Feature table: first CSV column is the index, one row per patient
        features = pd.read_csv(os.path.join(raw_folder, 'X_train.csv'), index_col=0).values

        # Targets; atleast_1d keeps a single-row file indexable
        targets = np.atleast_1d(
            np.loadtxt(os.path.join(raw_folder, 'y_train.csv'), delimiter=','))

        # Connectivity shared by every patient graph
        edge_index = torch.load(os.path.join(raw_folder, 'edge_index.pt'))

        if len(features) != len(targets):
            raise ValueError(
                f'X_train.csv has {len(features)} rows but y_train.csv has '
                f'{len(targets)} targets')

        # x keeps the numeric dtype parsed from the CSV (no cast is applied
        # here); y is an explicit float32 tensor of shape [1].
        data_list = [
            Data(x=torch.tensor(row.reshape(-1, 1)),
                 edge_index=edge_index,
                 y=torch.tensor([float(target)], dtype=torch.float))
            for row, target in zip(features, targets)
        ]

        # Honour the optional hooks accepted by the constructor
        if self.pre_filter is not None:
            data_list = [graph for graph in data_list if self.pre_filter(graph)]
        if self.pre_transform is not None:
            data_list = [self.pre_transform(graph) for graph in data_list]

        data, slices = self.collate(data_list)
        # Save the processed data
        torch.save((data, slices), self.processed_paths[0])

In [None]:
# Build the training dataset rooted at /content/sample_data (raw files go to
# <root>/raw, as the recorded log shows).
# NOTE: despite the df_ prefix this is an Acc_train dataset, not a DataFrame.
df_tr = Acc_train(root='/content/sample_data')

Downloading https://zenodo.org/record/8117485/files/Train%20set.zip?download=1
Extracting /content/sample_data/raw/Train%20set.zip
Processing...
Done!


In [None]:
# Display the dataset repr (class name followed by the number of graphs)
df_tr

Acc_train(54)

In [None]:
# Inspect a single graph from the training dataset
sample = df_tr[0]  # first Data object
print(sample)  # one-line summary of the Data object

# Pull out the node features, edge indices, and target label in one step
node_features, edge_index, target = sample.x, sample.edge_index, sample.y

# Print each of the three on its own line, in that order
print(node_features, edge_index, target, sep='\n')

Data(x=[9288, 1], edge_index=[2, 271771], y=[1])
tensor([[    0.0000],
        [10373.7000],
        [    0.0000],
        ...,
        [  183.9580],
        [  146.9740],
        [  596.0620]], dtype=torch.float64)
tensor([[   0,    0,    0,  ..., 9287, 9287, 9287],
        [ 451,  452,  453,  ..., 3323, 3340, 3341]])
tensor([44.5475])


# Creating InMemoryDataset Class for Test Set

In [None]:
class Acc_test(InMemoryDataset):
    """In-memory PyG dataset of patient-specific graphs for the test split.

    Every row of ``X_test.csv`` is one patient. The row is turned into a
    ``Data`` object whose ``x`` is a ``[num_nodes, 1]`` column of that row's
    values, whose ``edge_index`` is the single tensor stored in
    ``edge_index.pt`` (shared by all patients), and whose ``y`` is the
    matching entry of ``y_test.csv`` as a float32 tensor of shape ``[1]``.

    Args:
        root: Directory under which the ``raw`` and ``processed`` folders live.
        transform: Optional callable applied to a ``Data`` object on access.
        pre_transform: Optional callable applied to every ``Data`` object
            once, before the processed file is written.
        pre_filter: Optional predicate; ``Data`` objects for which it returns
            a falsy value are dropped before the processed file is written.
    """

    # Archive that contains the raw test files
    url = 'https://zenodo.org/record/8117744/files/Test%20set.zip?download=1'

    # Folder inside ``raw_dir`` that holds the extracted files
    _raw_subdir = 'Test set'

    def __init__(self, root, transform=None, pre_transform=None, pre_filter=None):
        super().__init__(root, transform, pre_transform, pre_filter)
        # The parent constructor has downloaded/processed the data if needed,
        # so the processed file is expected to exist at this point.
        self.data, self.slices = torch.load(self.processed_paths[0])

    @property
    def raw_file_names(self):
        """Raw files, relative to ``raw_dir``.

        The names include the extraction subfolder so that the existence
        check on the raw files succeeds after the first download and the
        archive is not fetched again on every instantiation.
        """
        return [os.path.join(self._raw_subdir, name)
                for name in ('X_test.csv', 'y_test.csv', 'edge_index.pt')]

    @property
    def processed_file_names(self):
        """Name of the collated file written by :meth:`process`."""
        return 'test_data.pt'

    def download(self):
        """Download the archive into ``raw_dir``, extract it, delete the zip."""
        path = download_url(self.url, self.raw_dir)
        extract_zip(path, self.raw_dir)
        # The zip file is removed once its contents are extracted
        os.unlink(path)

    def process(self):
        """Build one ``Data`` object per patient and save the collated result.

        Raises:
            ValueError: If the feature table and the target file do not
                contain the same number of patients.
        """
        raw_folder = os.path.join(self.raw_dir, self._raw_subdir)

        # Feature table: first CSV column is the index, one row per patient
        features = pd.read_csv(os.path.join(raw_folder, 'X_test.csv'), index_col=0).values

        # Targets; atleast_1d keeps a single-row file indexable
        targets = np.atleast_1d(
            np.loadtxt(os.path.join(raw_folder, 'y_test.csv'), delimiter=','))

        # Connectivity shared by every patient graph
        edge_index = torch.load(os.path.join(raw_folder, 'edge_index.pt'))

        if len(features) != len(targets):
            raise ValueError(
                f'X_test.csv has {len(features)} rows but y_test.csv has '
                f'{len(targets)} targets')

        # x keeps the numeric dtype parsed from the CSV (no cast is applied
        # here); y is an explicit float32 tensor of shape [1] rather than the
        # float64 value that a bare torch.tensor(np.float64) would produce.
        data_list = [
            Data(x=torch.tensor(row.reshape(-1, 1)),
                 edge_index=edge_index,
                 y=torch.tensor([float(target)], dtype=torch.float))
            for row, target in zip(features, targets)
        ]

        # Honour the optional hooks accepted by the constructor
        if self.pre_filter is not None:
            data_list = [graph for graph in data_list if self.pre_filter(graph)]
        if self.pre_transform is not None:
            data_list = [self.pre_transform(graph) for graph in data_list]

        test_data, slices = self.collate(data_list)
        # Save the processed data
        torch.save((test_data, slices), self.processed_paths[0])

In [None]:
# Build the test dataset rooted at /content/sample_data (raw files go to
# <root>/raw, as the recorded log shows).
# NOTE: despite the df_ prefix this is an Acc_test dataset, not a DataFrame.
df_test = Acc_test(root='/content/sample_data')

Downloading https://zenodo.org/record/8117744/files/Test%20set.zip?download=1
Extracting /content/sample_data/raw/Test%20set.zip
Processing...
Done!


In [None]:
# Display the dataset repr (class name followed by the number of graphs)
df_test

Acc_test(24)

In [None]:
# Inspect a single graph from the test dataset
sample = df_test[0]  # first Data object
print(sample)  # one-line summary of the Data object

# Pull out the node features, edge indices, and target label in one step
node_features, edge_index, target = sample.x, sample.edge_index, sample.y

# Print each of the three on its own line, in that order
print(node_features, edge_index, target, sep='\n')

Data(x=[9288, 1], edge_index=[2, 271771], y=[1])
tensor([[    0.0000],
        [16494.5000],
        [    0.0000],
        ...,
        [   83.4073],
        [  106.4770],
        [  403.7270]], dtype=torch.float64)
tensor([[   0,    0,    0,  ..., 9287, 9287, 9287],
        [ 451,  452,  453,  ..., 3323, 3340, 3341]])
tensor([23.6381], dtype=torch.float64)
