# Patient Similarity Graph Using GNN


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.spatial.distance import pdist, squareform

import os
import torch
os.environ['TORCH'] = torch.__version__
print(torch.__version__)
import torch_geometric
from torch_geometric.datasets import Planetoid
use_cuda_if_available = False

2.0.0+cpu


In [2]:
def dropping_cols(df, p=80):
    #1- count the number of NaN values in each column
    #2- calculate the percentage of NaN values in each column
    #3- get the list of columns to drop
    #4- drop the columns with more than 80% NaN values
    nan_counts = df.isna().sum()    
    nan_percentages = nan_counts / len(df) * 100 
    cols_to_drop = nan_percentages[nan_percentages > p].index.tolist()
    df = df.drop(cols_to_drop, axis=1)
    return df   

# Reading Lung dataset

In [3]:
path = 'E:\VCU 2023\PSN Patient Similarity Network\GraphAugmentation'
original_lung = pd.read_csv(f'{path}/data/Lung/numerical.csv', index_col=0)

In [4]:
original_features = list(original_lung.columns)

new_features      = [f'F{i}' for i in range(len(original_features))]
features_dict     = {new_features[i]: list(original_features)[i] for i in range(len(original_features))}

Lung = original_lung
Lung = Lung.rename(columns=dict(zip(original_features, new_features)))
Lung = dropping_cols(Lung)

# Imputing the NaN values to the mean
features_to_impute = [i for i in list(Lung.columns) if i not in ['F11', 'F20','F21','F22']]
print(features_to_impute)
Lung[features_to_impute] = Lung[features_to_impute].fillna(Lung[features_to_impute].mean())


['F1', 'F2', 'F4', 'F5', 'F6', 'F7', 'F8', 'F9', 'F12', 'F13', 'F16', 'F18', 'F19', 'F23', 'F24']


In [5]:
df = Lung[features_to_impute]
df = df.mul(100).round().astype(int)
X = torch.tensor(df.values)
num_classes  = X.shape[0]
num_features = X.shape[1]
print(X.shape)
tensors = []
for row in X:
    temp = []
    for col in row:
        temp.append(torch.tensor(col))
    tensors.append(torch.tensor(temp))

X = torch.stack(tensors)
X[0]

torch.Size([773, 15])


  temp.append(torch.tensor(col))


tensor([ 715,  800, 1515,    0,  776,  800, 6887,    0, 2431, 4263,    0, 5506,
           0, 2642, 4778], dtype=torch.int32)

In [6]:
# finding the similarity matrix SM of the given datafram 
SM = pd.DataFrame(1/(1 + squareform(pdist(df, 'euclidean'))), index=df.index, columns=df.index).values
SM = (SM * 100).round().astype(int)
SM

array([[100,   0,   0, ...,   0,   0,   0],
       [  0, 100,   0, ...,   0,   0,   0],
       [  0,   0, 100, ...,   0,   0,   0],
       ...,
       [  0,   0,   0, ..., 100,   0,   0],
       [  0,   0,   0, ...,   0, 100,   0],
       [  0,   0,   0, ...,   0,   0, 100]])

In [7]:
source = []
target = []
weight = []

similarity_threshold = 0

for i in range(SM.shape[0]):
    for j in range(SM.shape[1]):
        if SM[i,j]> similarity_threshold:
            source.append(i)
            target.append(j)
            weight.append(SM[i,j])

edge_index = torch.tensor([source, target])
edge_index.shape


torch.Size([2, 2467])

In [8]:
v = {'NSCLC'        : 0,
     'NSCLC Surgery': 1,
     'SCLC'         : 2}
Y = torch.tensor([v[i] for i in list(Lung['F22'])])
Y.shape

torch.Size([773])

In [9]:
def split_mask(n, tr=0.8, vl=0.1, ts=0.1):
    import random
    train_size = int(n * tr)
    val_size = int(n * vl)
    test_size = int(n * ts)

    # Initialize the three lists
    train_list = torch.zeros(n, dtype=torch.bool)
    val_list   = torch.zeros(n, dtype=torch.bool)
    test_list  = torch.zeros(n, dtype=torch.bool)

    indices = [i for i in range(n)]
    random.shuffle(indices)

    for i in range(n):
        j = indices[i]
        if i <train_size:
            train_list[j] = torch.tensor(True)
        elif i>= train_size and i< train_size + val_size:
            val_list[j] = torch.tensor(True)
        elif i>=train_size + val_size:
            test_list[j] = torch.tensor(True)
    return train_list, val_list, test_list


In [10]:
from torch_geometric.data import Data
tr_mask, v_mask, ts_mask = split_mask(X.shape[0])
data = Data(x=X, edge_index = edge_index, y = Y, train_mask = tr_mask, val_mask = v_mask, test_mask = ts_mask)

In [11]:
# # print(type(dataset.data.train_mask))
# # print(type(dataset.data.val_mask))
# # print(type(dataset.data.test_mask))
# print(data.x)
# print("edge_index:\t\t",data.edge_index.shape)
# print(data.edge_index)
# print("\n")
# print("train_mask:\t\t",data.train_mask.shape)
# print(data.train_mask)
# print("\n")
# print("x:\t\t",data.x.shape)
# print(data.x)
# print("\n")
# print("y:\t\t",data.y.shape)
# print(data.y)
data

Data(x=[773, 15], edge_index=[2, 2467], y=[773], train_mask=[773], val_mask=[773], test_mask=[773])

In [12]:
import os.path as osp

import torch
import torch.nn.functional as F
from torch_geometric.nn import SAGEConv

In [23]:
class Net(torch.nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv = SAGEConv(num_features,
                             num_classes,
                             aggr="max") # max, mean, add ...)

    def forward(self):
        x = self.conv(data.x, data.edge_index)
        return F.log_softmax(x, dim=1)
    

In [24]:
device = torch.device('cuda' if torch.cuda.is_available() and use_cuda_if_available else 'cpu')
model, data = Net().to(device), data.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

In [25]:
device

device(type='cpu')

In [26]:
def train():
    model.train()
    optimizer.zero_grad()
    F.nll_loss(model()[data.train_mask], data.y[data.train_mask]).backward()
    optimizer.step()


def test():
    model.eval()
    logits, accs = model(), []
    for _, mask in data('train_mask', 'val_mask', 'test_mask'):
        pred = logits[mask].max(1)[1]
        acc = pred.eq(data.y[mask]).sum().item() / mask.sum().item()
        accs.append(acc)
    return accs

In [27]:
best_val_acc = test_acc = 0
for epoch in range(1,100):
    train()
    _, val_acc, tmp_test_acc = test()
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        test_acc = tmp_test_acc
    log = 'Epoch: {:03d}, Val: {:.4f}, Test: {:.4f}'
    
    if epoch % 10 == 0:
        print(log.format(epoch, best_val_acc, test_acc))

torch.Size([773, 15])
tensor([[  0,   0,   0,  ..., 770, 771, 772],
        [  0,  41, 158,  ..., 770, 771, 772]])


RuntimeError: mat1 and mat2 must have the same dtype

In [None]:
train_list