In [1]:
import os 
import numpy as np 
import pandas as pd 
import scipy.sparse as sp
import torch
from torch_geometric.data import Data
from sklearn.neighbors import kneighbors_graph
import joblib 
from tqdm import tqdm 

# Dataset location and the expression-table variant to process.
# exp_type doubles as the CSV file stem and as the name of the output sub-folder
# that later cells create under data_root.
data_root = '../data/Chang'
exp_type = 'normalized_exp'
exp_path = os.path.join(data_root,exp_type+'.csv')

# Derived from data_root (instead of repeating the folder literal) so that
# pointing data_root at another dataset moves both input files together.
clinical_path = os.path.join(data_root,'clinical.xlsx')

In [2]:

# Expression table: later cells read its ID, Position, Area, MajorType and
# SubType columns and take the first 35 columns as node features.
df = pd.read_csv(exp_path)

# Clinical sheet keyed by patient id so rows can be fetched with clinical.loc[<PID>].
clinical = pd.read_excel(clinical_path).set_index('PID')

In [3]:
# Show the clinical table (indexed by PID) to inspect which columns are available.
clinical

Unnamed: 0_level_0,Prognosis,is_Rec,RFS_status,RFS_time,RFS_liver_status,RFS_liver_time,Recurrence_site,zs_rec_riskmodel,fong_score,Gender,...,CEA,CEA_200_0,CEA_30_0,CA199,CA199_200_0,MSI,Pathology,Differential_grad,T_grade,Lymph_grade
PID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
W25,1,No Rec,0,45.066667,0,45.066667,Liver,1,2,1,...,106.5549,0,1,334.437248,1,0,0,3,4,1
B2,0,Rec,1,17.833333,1,17.833333,Liver,0,5,1,...,955.1,1,1,14.7,0,0,0,3,4,1
B3,0,Rec,1,6.3,1,6.3,Liver,0,4,1,...,41.9,0,1,13.3,0,0,0,2,3,1
B4,0,Rec,1,12.666667,1,12.666667,Liver,0,3,1,...,18.9,0,0,17.3,0,0,0,2,3,1
B5,0,Rec,1,12.9,1,12.9,Liver,0,3,2,...,11.1,0,0,7.4,0,0,1,3,3,1
B6,0,Rec,1,8.966667,1,8.966667,Liver,0,3,1,...,12.1,0,0,83.1,0,0,0,3,4,1
B7,0,Rec,1,9.066667,1,9.066667,Liver,0,3,2,...,26.1,0,0,988.3,1,0,1,3,3,1
B8,0,Rec,1,6.4,1,6.4,Liver,0,3,2,...,2.0,0,0,139.4,0,0,1,3,3,1
B9,0,Rec,1,9.533333,1,9.533333,Liver,0,3,2,...,3.6,0,0,78.6,0,0,1,2,3,1
B10,0,Rec,1,6.466667,1,6.466667,Liver,0,2,1,...,3.2,0,0,36.2,0,0,0,2,3,1


In [4]:
def list_pos_to_h_w(list_pos):
    """Split textual positions into two parallel lists of floats.

    Each entry is expected to look like ``"(h, w)"``: the first and last
    characters are dropped, the remainder is split on commas, and the first
    two fields are converted with ``float``.

    Parameters
    ----------
    list_pos : list of str
        Position strings, one per cell.

    Returns
    -------
    tuple of (list of float, list of float)
        ``(h_list, w_list)`` - the first and second coordinate of every entry,
        in input order.
    """
    def _axis(field_idx):
        # One full pass over the input per coordinate.
        values = []
        for entry in list_pos:
            fields = entry[1:-1].split(',')
            values.append(float(fields[field_idx]))
        return values

    heights = _axis(0)
    widths = _axis(1)
    return heights, widths

def gen_edge_index_weight(pos_t, n_neighbors=5, alpha=-0.05):
    """Build a k-nearest-neighbour graph over node positions with distance-decayed edge weights.

    Parameters
    ----------
    pos_t : torch.Tensor
        Node coordinates, one row per node (the caller in this notebook passes
        an ``(N, 2)`` float tensor).
    n_neighbors : int, default 5
        Neighbours requested per node. It is clamped to ``N - 1`` because
        ``kneighbors_graph`` raises when asked for more neighbours than there
        are other nodes.
    alpha : float, default -0.05
        Hyperparameter for the edge weight: ``weight = exp(alpha * d**2)``
        where ``d`` is the Euclidean distance between the two endpoints.

    Returns
    -------
    edge_index_A : torch.Tensor
        int64 tensor of shape ``(2, E)`` holding the row and column indices of
        the non-zero entries of the kNN connectivity matrix.
    ed_weight : torch.Tensor
        Tensor of shape ``(E,)`` with one weight per edge.

    Notes
    -----
    With fewer than two nodes no edge can exist, so an empty edge set is
    returned instead of letting ``kneighbors_graph`` fail.
    """
    n_nodes = pos_t.shape[0]
    k = min(n_neighbors, n_nodes - 1)
    if k < 1:
        # Nothing to connect: return well-formed empty tensors.
        return torch.zeros((2, 0), dtype=torch.long), pos_t.new_zeros((0,))

    A = kneighbors_graph(pos_t.detach().cpu().numpy(), n_neighbors=k, mode='connectivity', include_self=False)
    edge_index_temp = sp.coo_matrix(A)
    # scipy may store indices as int32; convert explicitly to the int64 torch expects.
    indices = np.vstack((edge_index_temp.row, edge_index_temp.col)).astype(np.int64)
    edge_index_A = torch.from_numpy(indices)

    # Squared Euclidean distance between the two endpoints of every edge.
    diff = pos_t[edge_index_A[0]] - pos_t[edge_index_A[1]]
    dists2 = (diff * diff).sum(dim=1)
    ed_weight = torch.exp(alpha * dists2)

    return edge_index_A, ed_weight

In [5]:
# Output folder for the per-sample graphs; makedirs also creates the
# intermediate data_root/exp_type folder when it is missing.
target_gnn_dir = os.path.join(data_root,exp_type,'gnn_data')
os.makedirs(target_gnn_dir,exist_ok=True)

# One graph per distinct value of df.ID.
sample_ids = list(df.ID.unique())
for sample_id in tqdm(sample_ids):
    sample_df = df[df.ID==sample_id]

    # Node features: the first 35 columns are taken as the expression columns.
    expr_matrix = sample_df.iloc[:,:35].to_numpy()
    x_fea = torch.from_numpy(expr_matrix).float()

    # Node coordinates parsed from the Position strings, stacked into (N, 2).
    heights,widths = list_pos_to_h_w(list(sample_df.Position))
    pos_t = torch.stack([torch.Tensor(heights),torch.Tensor(widths)]).T
    edge_index_t,edge_weight_t = gen_edge_index_weight(pos_t)

    area_t = torch.from_numpy(sample_df.Area.to_numpy()).float()
    major_types = list(sample_df.MajorType)
    sub_types = list(sample_df.SubType)

    # The patient id is the part of the sample id before the first underscore.
    patient_id = sample_id.split('_')[0]
    patient_row = clinical.loc[patient_id]
    # NOTE: the graph attribute named RFS_status is filled from the
    # RFS_liver_status clinical column.
    gnn_data = Data(
        x=x_fea,
        edge_index=edge_index_t,
        pos=pos_t,
        edge_weight=edge_weight_t.float(),
        area=area_t,
        cell_type_final=sub_types,
        cell_major=major_types,
        RFS_status=patient_row['RFS_liver_status'],
        KRAS_mutation=patient_row['KRAS_mutation'],
    )

    torch.save(gnn_data,os.path.join(target_gnn_dir,sample_id+'.pkl'))

100%|██████████| 311/311 [00:51<00:00,  6.10it/s]


In [6]:
import random

# Fixed seed so the randomly drawn validation patients are the same on every run.
fold_seed = 42
val_ratio = 0.2

label_fold_dir = os.path.join(data_root,exp_type,'label_and_fold')
os.makedirs(label_fold_dir,exist_ok=True)

# os.listdir returns entries in arbitrary order and may include stray files;
# keep only the saved graphs and sort them for a stable order.
gnn_data_list = sorted(f for f in os.listdir(target_gnn_dir) if f.endswith('.pkl'))


def _patient_of(gnn_name):
    """Return the patient id: the part of the file name before the first underscore."""
    return gnn_name.split('_')[0]


# Binary recurrence label per graph file: 'Rec' -> 1, 'No Rec' -> 0.
# Files whose patient has any other is_Rec value are left out of the dict.
rec_label = {}
for gnn_name in gnn_data_list:
    is_rec = clinical.loc[_patient_of(gnn_name)]['is_Rec']
    if is_rec == 'Rec':
        rec_label[gnn_name]=1
    elif is_rec == 'No Rec':
        rec_label[gnn_name]=0

joblib.dump(rec_label,os.path.join(label_fold_dir,'response_label_dict.pkl'))

# Group files by exact patient id. A substring test such as `p+'_' in name`
# can also match files of a different patient whose id merely ends with p,
# which would leak samples between the train/val/test splits.
files_by_patient = {}
for gnn_name in gnn_data_list:
    files_by_patient.setdefault(_patient_of(gnn_name),[]).append(gnn_name)

# Sorted so that fold numbering does not depend on set/hash ordering.
patients = sorted(files_by_patient)

# Leave-one-patient-out: fold i tests on patient i, validates on a random
# val_ratio share of the remaining patients and trains on everything else.
rng = random.Random(fold_seed)
leave_one_fold={}
for fold_idx,p in enumerate(patients):
    other_patients = [q for q in patients if q != p]
    # Never ask for more validation patients than are available.
    n_val = min(round(len(patients)*val_ratio),len(other_patients))
    val_patients = rng.sample(other_patients,n_val)

    test_files = list(files_by_patient[p])
    val_files = [r for val_p in val_patients for r in files_by_patient[val_p]]
    held_out = set(test_files) | set(val_files)
    train_files = [r for r in gnn_data_list if r not in held_out]

    leave_one_fold['fold'+str(fold_idx)+'_train']=train_files
    leave_one_fold['fold'+str(fold_idx)+'_val']=val_files
    leave_one_fold['fold'+str(fold_idx)+'_test']=test_files

joblib.dump(leave_one_fold,os.path.join(label_fold_dir,'leave_one_fold_for_response.pkl'))

['../data/Chang/normalized_exp/label_and_fold/leave_one_fold_for_response.pkl']