# Patient Similarity Graph Using GNN (SAGEConv)


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.spatial.distance import pdist, squareform

import os
import torch
os.environ['TORCH'] = torch.__version__
print(torch.__version__)
import torch_geometric
from torch_geometric.datasets import Planetoid
use_cuda_if_available = False

1.13.1+cpu


In [2]:
def dropping_cols(df, p=80):
    #1- count the number of NaN values in each column
    #2- calculate the percentage of NaN values in each column
    #3- get the list of columns to drop
    #4- drop the columns with more than 80% NaN values
    nan_counts = df.isna().sum()    
    nan_percentages = nan_counts / len(df) * 100 
    cols_to_drop = nan_percentages[nan_percentages > p].index.tolist()
    df = df.drop(cols_to_drop, axis=1)
    return df   

In [3]:
def split_mask(n, tr=0.8, vl=0.1, ts=0.1):
    import random
    train_size = int(n * tr)
    val_size = int(n * vl)
    test_size = int(n * ts)

    # Initialize the three lists
    train_list = torch.zeros(n, dtype=torch.bool)
    val_list   = torch.zeros(n, dtype=torch.bool)
    test_list  = torch.zeros(n, dtype=torch.bool)

    indices = [i for i in range(n)]
    random.shuffle(indices)

    for i in range(n):
        j = indices[i]
        if i <train_size:
            train_list[j] = torch.tensor(True)
        elif i>= train_size and i< train_size + val_size:
            val_list[j] = torch.tensor(True)
        elif i>=train_size + val_size:
            test_list[j] = torch.tensor(True)
    return train_list, val_list, test_list


# Creating the data using Pytorch Geometric 

In [52]:
import os.path as osp

import torch
from torch_geometric.data import Dataset, download_url


class MyOwnDataset(Dataset):
    def __init__(self, root, transform=None, pre_transform=None, pre_filter=None):
        self.root = root
        super(MyOwnDataset, self).__init__(root, transform, pre_transform, pre_filter)
        # Call process to read the data file
        self.process()
        
    def __len__(self):
        # Return the number of data samples in your dataset
        return len(self.data)
    
    def __getitem__(self, idx):
        # Load and preprocess the data sample at the given index
        data = self.data[idx]
        # ...
        return data

    def process(self):
        self.data = pd.read_csv(self.root)
        X = self._get_node_features(self.data)
        print(X)
        
        
        
    def _get_node_features(self, L):
        '''This will return a 2d matrix of [number of nodes X number of features]'''
        features_to_impute = [i for i in list(L.columns) if i not in ['F11', 'F20','F21','F22']]
        df = L[features_to_impute]

        tensors = []
        for row in df.values:
            temp = []
            for col in row:
                temp.append(col)
            tensors.append(temp)
        return torch.tensor(np.asarray(tensors))

   

In [53]:
myData = MyOwnDataset(root = 'data/')

TypeError: Can't instantiate abstract class MyOwnDataset with abstract methods get, len

In [54]:
myData

TypeError: 'NoneType' object cannot be interpreted as an integer

# Reading Lung dataset

In [5]:
path = 'E:\VCU 2023\PSN Patient Similarity Network\GraphAugmentation'
original_lung = pd.read_csv(f'{path}/data/Lung/numerical.csv', index_col=0)

In [6]:
original_features = list(original_lung.columns)

new_features      = [f'F{i}' for i in range(len(original_features))]
features_dict     = {new_features[i]: list(original_features)[i] for i in range(len(original_features))}

Lung = original_lung
Lung = Lung.rename(columns=dict(zip(original_features, new_features)))
Lung = dropping_cols(Lung)

# Imputing the NaN values to the mean
features_to_impute = [i for i in list(Lung.columns) if i not in ['F11', 'F20','F21','F22']]
print(features_to_impute)
Lung[features_to_impute] = Lung[features_to_impute].fillna(Lung[features_to_impute].mean())


['F1', 'F2', 'F4', 'F5', 'F6', 'F7', 'F8', 'F9', 'F12', 'F13', 'F16', 'F18', 'F19', 'F23', 'F24']


### 1- creting the data.X

In [7]:

num_classes  = X.shape[0]
num_features = X.shape[1]


X[0]

NameError: name 'X' is not defined

### 2- Creating data.edge_index
+ finding the similarity matrix SM of the given datafram 
+ convert the similarity matrix to edge_list

In [None]:
SM = pd.DataFrame(1/(1 + squareform(pdist(df, 'euclidean'))), index=df.index, columns=df.index).values
SM

In [None]:
source = []
target = []
weight = []

similarity_threshold = 0

for i in range(SM.shape[0]):
    for j in range(SM.shape[1]):
        if SM[i,j]> similarity_threshold:
            source.append(i)
            target.append(j)
            weight.append(SM[i,j])

edge_index = torch.tensor([source, target])
edge_index.shape


### 3- Creating data.Y

In [None]:
v = {'NSCLC'        : 0,
     'NSCLC Surgery': 1,
     'SCLC'         : 2}
Y = torch.tensor([v[i] for i in list(Lung['F22'])])
Y.shape

### 4- Creating the different masks

In [None]:
tr_mask, v_mask, ts_mask = split_mask(X.shape[0])

### 5- Creating the data object

In [None]:
from torch_geometric.data import Data
data = Data(x=X, edge_index = edge_index, y = Y, train_mask = tr_mask, val_mask = v_mask, test_mask = ts_mask)

print(num_features,num_classes,)
data

# GNN section

In [None]:
import os.path as osp

import torch
import torch.nn.functional as F
from torch_geometric.nn import SAGEConv

In [None]:
class Net(torch.nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv = SAGEConv(num_features,
                             num_classes,
                             aggr="max") # max, mean, add ...)

    def forward(self):
        _ = input(f'{data.x.shape}\n{data.edge_index.shape}')
        x = self.conv(data.x, data.edge_index)
        return F.log_softmax(x, dim=1)
    

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() and use_cuda_if_available else 'cpu')
model, data = Net().to(device), data.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

In [None]:
device

In [None]:
def train():
    model.train()
    optimizer.zero_grad()
    F.nll_loss(model()[data.train_mask], data.y[data.train_mask]).backward()
    optimizer.step()


def test():
    model.eval()
    logits, accs = model(), []
    for _, mask in data('train_mask', 'val_mask', 'test_mask'):
        pred = logits[mask].max(1)[1]
        acc = pred.eq(data.y[mask]).sum().item() / mask.sum().item()
        accs.append(acc)
    return accs

In [None]:
best_val_acc = test_acc = 0
for epoch in range(1,100):
    train()
    _, val_acc, tmp_test_acc = test()
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        test_acc = tmp_test_acc
    log = 'Epoch: {:03d}, Val: {:.4f}, Test: {:.4f}'
    
    if epoch % 10 == 0:
        print(log.format(epoch, best_val_acc, test_acc))

In [None]:
data.x

In [None]:
import torch
from torch.nn import Linear, ReLU
from torch_geometric.nn import SAGEConv
# from torch_geometric.datasets import Planetoid
import torch.nn.functional as F

# Define the dataset
# dataset = Planetoid(root='/tmp/Cora', name='Cora')

# Define the model
class Net(torch.nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = SAGEConv(num_features, 16)
        self.conv2 = SAGEConv(16, num_classes)
        self.lin1 = Linear(num_classes, 32)
        self.lin2 = Linear(32, num_classes)
        
    def forward(self, x, edge_index):
        x = F.relu(self.conv1(x, edge_index))
        x = F.dropout(x, training=self.training)
        x = F.relu(self.conv2(x, edge_index))
        x = F.dropout(x, training=self.training)
        x = F.relu(self.lin1(x))
        x = self.lin2(x)
        return F.log_softmax(x, dim=1)

# Initialize the model and define the optimizer
model = Net()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# Train the model
def train():
    model.train()
    optimizer.zero_grad()
    out = model(data.x, data.edge_index)
    loss = F.nll_loss(out, data.y)
    loss.backward()
    optimizer.step()

# Evaluate the model
def test():
    model.eval()
    out = model(data.x, data.edge_index)
    pred = out.argmax(dim=1)
    acc = pred.eq(data.y).sum().item() / len(data.y)
    return acc

for epoch in range(1, 201):
    train()
    if epoch % 10 == 0:
        acc = test()
        print(f'Epoch: {epoch:03d}, Test Acc: {acc:.4f}')


In [None]:
dataset[0]