# Patient Similarity Graph Using GNN (SAGEConv)


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.spatial.distance import pdist, squareform

import os
import os.path as osp

import torch
import torch_geometric
# from torch_geometric.datasets import Planetoid
from torch_geometric.data import Data

import torch.nn.functional as F
from torch_geometric.nn import SAGEConv


os.environ['TORCH'] = torch.__version__
print(torch.__version__)
use_cuda_if_available = False

1.13.1


In [2]:
def dropping_cols(df, p=80):
    #1- count the number of NaN values in each column
    #2- calculate the percentage of NaN values in each column
    #3- get the list of columns to drop
    #4- drop the columns with more than 80% NaN values
    nan_counts = df.isna().sum()    
    nan_percentages = nan_counts / len(df) * 100 
    cols_to_drop = nan_percentages[nan_percentages > p].index.tolist()
    df = df.drop(cols_to_drop, axis=1)
    return df   

In [3]:
def split_mask(n, tr=0.8, vl=0.1, ts=0.1):
    import random
    train_size = int(n * tr)
    val_size = int(n * vl)
    test_size = int(n * ts)

    # Initialize the three lists
    train_list = torch.zeros(n, dtype=torch.bool)
    val_list   = torch.zeros(n, dtype=torch.bool)
    test_list  = torch.zeros(n, dtype=torch.bool)

    indices = [i for i in range(n)]
    random.shuffle(indices)

    for i in range(n):
        j = indices[i]
        if i <train_size:
            train_list[j] = torch.tensor(True)
        elif i>= train_size and i< train_size + val_size:
            val_list[j] = torch.tensor(True)
        elif i>=train_size + val_size:
            test_list[j] = torch.tensor(True)
    return train_list, val_list, test_list


# Reading Lung dataset

In [4]:
original_lung = pd.read_csv(f'data/raw/numerical.csv', index_col=0)
original_lung

Unnamed: 0,Lung_Ipsi_Already_Subtracted_V5Gy,LU_DVH_28,TotalNumberOfNotes,Lung_Contra_Already_Subtracted_V20Gy,Lung_Ipsi + Lung_Contra - Lung_Subtraction_Structure ((Lung_Ipsi + Lung_Contra) - Lung_Subtraction_Structure)_DMean,LungEsophagitisWithGrade,LU_DVH_24,NumberOfNotesWithToxicityInitialized,LU_DVH_25,LungPneumonitisTotal,...,Lung_Ipsi_Already_Subtracted_V20Gy,LungPneumonitisWithGrade,Lung_Ipsi_Already_Subtracted_DMean,Lung_Ipsi + Lung_Contra - Lung_Subtraction_Structure ((Lung_Ipsi + Lung_Contra) - Lung_Subtraction_Structure)_V5Gy,LungEsophagitisTotal,center_id,vha_id,cancer_type,Lung_Ipsi + Lung_Contra - Lung_Subtraction_Structure ((Lung_Ipsi + Lung_Contra) - Lung_Subtraction_Structure)_V20Gy,LU_DVH_23
0,,,8,,,0,,8,,0,...,,0,,,0,506,506-NSCLC-01,NSCLC,,
1,,6.677710,9,,14.487703,6,10.960474,9,63.618332,0,...,,0,,46.657006,6,506,506-NSCLC-02,NSCLC,28.126339,47.123823
2,,4.935014,10,,16.569222,6,5.663030,10,66.684741,0,...,,0,,48.884592,6,506,506-NSCLC-03,NSCLC,30.136868,47.800127
3,,5.636330,9,,14.532199,4,0.557558,9,57.980541,0,...,,0,,48.736981,4,506,506-NSCLC-04,NSCLC,22.573872,43.076979
4,,7.276093,10,,19.432468,5,5.375518,10,91.226344,0,...,,0,,72.688403,5,506,506-NSCLC-05,NSCLC,33.505503,62.573410
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
768,,1.298097,9,,11.977461,0,0.268814,9,53.720835,0,...,,0,,29.844577,0,632,632-NSCLC-06,NSCLC,21.028291,39.040868
769,,4.887026,4,,16.379336,0,3.171784,4,74.574599,0,...,,0,,52.029705,0,667,667-NSCLC-08,NSCLC,27.637646,59.612198
770,,6.826124,6,,16.608732,0,5.895222,6,70.892367,0,...,,0,,56.074164,0,671,671-NSCLC-08,NSCLC,27.100541,59.570063
771,,3.590745,6,,12.384861,0,0.080024,6,78.265304,0,...,,0,,50.171829,0,671,671-SCLC-06,SCLC,22.719697,43.918297


In [5]:
original_features = list(original_lung.columns)

new_features      = [f'F{i}' for i in range(len(original_features))]
features_dict     = {new_features[i]: list(original_features)[i] for i in range(len(original_features))}

Lung = original_lung
Lung = Lung.rename(columns=dict(zip(original_features, new_features)))
Lung = dropping_cols(Lung)

# Imputing the NaN values to the mean
features_to_impute = [i for i in list(Lung.columns) if i not in ['F11', 'F20','F21','F22']]
# print(features_to_impute)
# Lung[features_to_impute] = Lung[features_to_impute].fillna(Lung[features_to_impute].mean())
Lung[features_to_impute] = Lung[features_to_impute].fillna(0)
# Lung.to_csv('data/raw/Lung.csv')

### 1- creting the data.X

In [6]:
df = Lung[features_to_impute]
# df = df.mul(100).round().astype(int)
X = torch.tensor(df.values)

num_classes  = X.shape[0]
num_features = X.shape[1]

print(X.shape)

tensors = []
for row in X:
    temp = []
    for col in row:
        temp.append(torch.tensor(col))
    tensors.append(torch.tensor(temp))

X = torch.stack(tensors)
X[0]
X = X.to(torch.float32)

torch.Size([773, 15])


  temp.append(torch.tensor(col))


In [7]:
df

Unnamed: 0,F1,F2,F4,F5,F6,F7,F8,F9,F12,F13,F16,F18,F19,F23,F24
0,0.000000,8,0.000000,0,0.000000,8,0.000000,0,0.000000,0.000000,0,0.000000,0,0.000000,0.000000
1,6.677710,9,14.487703,6,10.960474,9,63.618332,0,23.131032,31.330984,0,46.657006,6,28.126339,47.123823
2,4.935014,10,16.569222,6,5.663030,10,66.684741,0,24.965864,24.221081,0,48.884592,6,30.136868,47.800127
3,5.636330,9,14.532199,4,0.557558,9,57.980541,0,22.816645,38.811212,0,48.736981,4,22.573872,43.076979
4,7.276093,10,19.432468,5,5.375518,10,91.226344,0,31.994166,54.748618,0,72.688403,5,33.505503,62.573410
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
768,1.298097,9,11.977461,0,0.268814,9,53.720835,0,21.243729,2.327215,0,29.844577,0,21.028291,39.040868
769,4.887026,4,16.379336,0,3.171784,4,74.574599,0,31.398692,34.779103,0,52.029705,0,27.637646,59.612198
770,6.826124,6,16.608732,0,5.895222,6,70.892367,0,31.587831,46.396635,0,56.074164,0,27.100541,59.570063
771,3.590745,6,12.384861,0,0.080024,6,78.265304,0,20.619209,20.168570,0,50.171829,0,22.719697,43.918297


### 2- Creating data.edge_index
+ finding the similarity matrix SM of the given datafram 
+ convert the similarity matrix to edge_list

SM = pd.DataFrame(1/(1 + squareform(pdist(df, 'euclidean'))), index=df.index, columns=df.index).values
SM

In [9]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
df_scaled = pd.DataFrame(scaler.fit_transform(df))
df_scaled

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,0.000000,0.296296,0.000000,0.000000,0.000000,0.347826,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000
1,0.230338,0.333333,0.488530,1.000000,0.194219,0.391304,0.636183,0.0,0.423883,0.337949,0.0,0.487912,0.857143,0.470905,0.515981
2,0.170227,0.370370,0.558720,1.000000,0.100348,0.434783,0.666847,0.0,0.457506,0.261259,0.0,0.511206,0.857143,0.504566,0.523387
3,0.194417,0.333333,0.490031,0.666667,0.009880,0.391304,0.579805,0.0,0.418121,0.418634,0.0,0.509663,0.571429,0.377943,0.471671
4,0.250979,0.370370,0.655269,0.833333,0.095254,0.434783,0.912263,0.0,0.586302,0.590542,0.0,0.760133,0.714286,0.560966,0.685146
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
768,0.044776,0.333333,0.403884,0.000000,0.004763,0.391304,0.537208,0.0,0.389297,0.025102,0.0,0.312097,0.000000,0.352066,0.427477
769,0.168571,0.148148,0.552317,0.000000,0.056204,0.173913,0.745746,0.0,0.575390,0.375142,0.0,0.544096,0.000000,0.462723,0.652723
770,0.235458,0.222222,0.560052,0.000000,0.104463,0.260870,0.708924,0.0,0.578856,0.500454,0.0,0.586391,0.000000,0.453731,0.652261
771,0.123858,0.222222,0.417622,0.000000,0.001418,0.260870,0.782653,0.0,0.377853,0.217547,0.0,0.524668,0.000000,0.380384,0.480883


## Finding the similarity matrix

In [14]:
def sim(X, Y):
    t, s = 0, 0
    for i in range(len(X)):
        if X[i]>0 and Y[i]>0:
            t=t+1
            s = s + (X[i]-Y[i])**2
    return  (1-s**0.5) * t/len(X)

SM = df_scaled.apply(lambda row1: df_scaled.apply(lambda row2: sim(row1, row2), axis=1), axis=1)

print(SM)


          0         1         2         3         4         5         6    \
0    0.133333  0.125718  0.118103  0.125718  0.118103  0.125718  0.125718   
1    0.125718  0.866667  0.715815  0.434359  0.341846  0.148730  0.406350   
2    0.118103  0.715815  0.866667  0.422972  0.360864  0.071639  0.355738   
3    0.125718  0.434359  0.422972  0.866667  0.318943  0.364663  0.446542   
4    0.118103  0.341846  0.360864  0.318943  0.866667 -0.020569  0.525741   
..        ...       ...       ...       ...       ...       ...       ...   
768  0.125718  0.374656  0.404199  0.382399  0.027224  0.179827  0.083234   
769  0.102872  0.433878  0.429129  0.419144  0.353718  0.137415  0.337075   
770  0.118103  0.472162  0.444598  0.460579  0.445673  0.201823  0.455234   
771  0.118103  0.467160  0.476023  0.475963  0.234458  0.191200  0.286395   
772  0.118103  0.314381  0.291168  0.353855 -0.058569  0.291890  0.082411   

          7         8         9    ...       763       764       765  \
0  

In [15]:
def get_edge_index(SM, th=0):
    '''
    SM: similarity matrix,
    th: threshold for edge weight,
    return edge_index'''
    source = []
    target = []
    weight = []
    for i in range(SM.shape[0]):
        for j in range(SM.shape[1]):
            if SM[i,j]> th:
                source.append(i)
                target.append(j)
                weight.append(SM[i,j])

    return torch.tensor([source, target])

edge_index = get_edge_index(SM.values)
# total_edge = {t/100: get_edge_index(SM, t/100).shape[1] for t in range(0, 50)}
# total_edge    

### 3- Creating data.Y

In [16]:
v = {'NSCLC'        : 0,
     'NSCLC Surgery': 1,
     'SCLC'         : 2}
Y = torch.tensor([v[i] for i in list(Lung['F22'])])

print(list(Lung['F22']).count('NSCLC')/773)
print(list(Lung['F22']).count('NSCLC Surgery')/773)
print(list(Lung['F22']).count('SCLC')/773)


Y.shape

0.7076326002587322
0.08408796895213454
0.20827943078913325


torch.Size([773])

### 4- Creating the different masks

In [32]:
tr_mask, v_mask, ts_mask = split_mask(X.shape[0], 0.7,0.1,0.2)

### 5- Creating the data object

In [33]:
data = Data(x=X, edge_index = edge_index, y = Y, train_mask = tr_mask, val_mask = v_mask, test_mask = ts_mask)

print(num_features,num_classes,)
data

15 773


Data(x=[773, 15], edge_index=[2, 560233], y=[773], train_mask=[773], val_mask=[773], test_mask=[773])

# GNN section

In [34]:
class Net(torch.nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv = SAGEConv(num_features,
                             num_classes,
                             aggr="max") # max, mean, add ...)

    def forward(self):
        x = self.conv(data.x, data.edge_index)
        return F.log_softmax(x, dim=1)
    

In [40]:
device = torch.device('cuda' if torch.cuda.is_available() and use_cuda_if_available else 'cpu')
model, data = Net().to(device), data.to(device)
# optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

In [41]:
device

device(type='cpu')

In [42]:
def train():
    model.train()
    optimizer.zero_grad()
    F.nll_loss(model()[data.train_mask], data.y[data.train_mask]).backward()
    optimizer.step()


def test():
    model.eval()
    logits, accs = model(), []
    for _, mask in data('train_mask', 'val_mask', 'test_mask'):
        pred = logits[mask].max(1)[1]
        acc = pred.eq(data.y[mask]).sum().item() / mask.sum().item()
        accs.append(acc)
    return accs

In [43]:
best_val_acc = test_acc = 0
for epoch in range(1,500):
    train()
    _, val_acc, tmp_test_acc = test()
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        test_acc = tmp_test_acc
    log = 'Epoch: {:03d}, Val: {:.4f}, Test: {:.4f}'
    
    if epoch % 10 == 0:
        print(log.format(epoch, best_val_acc, test_acc))

Epoch: 010, Val: 0.7143, Test: 0.7226
Epoch: 020, Val: 0.7143, Test: 0.7226
Epoch: 030, Val: 0.7143, Test: 0.7226
Epoch: 040, Val: 0.7143, Test: 0.7226
Epoch: 050, Val: 0.7143, Test: 0.7226
Epoch: 060, Val: 0.7143, Test: 0.7226
Epoch: 070, Val: 0.7143, Test: 0.7226
Epoch: 080, Val: 0.7143, Test: 0.7226
Epoch: 090, Val: 0.7143, Test: 0.7226
Epoch: 100, Val: 0.7143, Test: 0.7226
Epoch: 110, Val: 0.7143, Test: 0.7226
Epoch: 120, Val: 0.7143, Test: 0.7226
Epoch: 130, Val: 0.7143, Test: 0.7226
Epoch: 140, Val: 0.7143, Test: 0.7226
Epoch: 150, Val: 0.7143, Test: 0.7226
Epoch: 160, Val: 0.7143, Test: 0.7226
Epoch: 170, Val: 0.7143, Test: 0.7226
Epoch: 180, Val: 0.7143, Test: 0.7226
Epoch: 190, Val: 0.7273, Test: 0.7161
Epoch: 200, Val: 0.7273, Test: 0.7161
Epoch: 210, Val: 0.7273, Test: 0.7161
Epoch: 220, Val: 0.7273, Test: 0.7161
Epoch: 230, Val: 0.7273, Test: 0.7161
Epoch: 240, Val: 0.7273, Test: 0.7161
Epoch: 250, Val: 0.7273, Test: 0.7161
Epoch: 260, Val: 0.7273, Test: 0.7161
Epoch: 270, 