# Generating the data
1. Generate the Heterogeneous graph
2. Generate the feature set from the clinical notes.
3. Generate the Labels
4. Generate the k-metapath-based similarity matrices
5. Convert the resulting matrices (the As) to an edge-based representation.

In [14]:
class heterogeneous_Graph:
    """Wrap a graph and bucket its nodes by the first character of their id.

    The prefixes recognised are 'C' (Patients), 'V' (Visits), 'M' (Medications),
    'D' (Diagnoses), 'P' (Procedures), 'L' (Labs) and 'B' (MicroBio).

    Attributes:
        HG: the graph object passed in, stored unchanged.
        Patients, Visits, Medications, Diagnoses, Procedures, Labs, MicroBio:
            lists of node ids with the matching prefix, in graph iteration order.
        Nodes: concatenation of the seven lists above, in that order. Nodes
            whose id starts with any other character are not included.
    """

    def __init__(self, G):
        self.HG = G
        all_nodes = list(self.HG.nodes())

        def with_prefix(prefix):
            # Keep the graph's own ordering inside every bucket.
            return [node for node in all_nodes if node[0] == prefix]

        self.Patients = with_prefix('C')
        self.Visits = with_prefix('V')
        self.Medications = with_prefix('M')
        self.Diagnoses = with_prefix('D')
        self.Procedures = with_prefix('P')
        self.Labs = with_prefix('L')
        self.MicroBio = with_prefix('B')

        # Type-grouped ordering: patients first, microbiology last.
        self.Nodes = (
            self.Patients
            + self.Visits
            + self.Medications
            + self.Diagnoses
            + self.Procedures
            + self.Labs
            + self.MicroBio
        )
        


In [2]:
import os
import torch
import networkx as nx
import random


def save_list_as_pickle(L, given_path, file_name):
    """Pickle ``L`` into ``<given_path>/<file_name>.pkl``.

    Prints the destination before writing. The directory ``given_path`` must
    already exist; an existing file with the same name is overwritten.

    Args:
        L: the object to serialise.
        given_path: directory that receives the file.
        file_name: file name without the ``.pkl`` extension.
    """
    import pickle

    target = f'{given_path}/{file_name}.pkl'
    print(f'saving to {target}')
    with open(target, 'wb') as handle:
        pickle.dump(L, handle)


# ---- Run configuration -------------------------------------------------------
# Every value below can be overridden through an environment variable; the
# defaults reproduce the original hard-coded settings.
num_Diseases    = int(os.getenv('NUM_DISEASES', 203))
DISEASE_FILE    = os.getenv('DISEASE_FILE', 'DMPLB2')
similarity_type = os.getenv('similarity_type', 'PC')  # options are PC: PathCount, SPS: Symmetric PathSim

num_Sample      = int(os.getenv('num_Sample', 500))
r_u_sampling    = os.getenv('r_u_sampling', 'True')

# Environment values are text: only the exact string 'True' switches a flag on.
sampling = r_u_sampling == 'True'
# Parsed in one step so SNF_ing is always a bool, never the raw string.
SNF_ing  = os.getenv('SNF_ing', 'True') == 'True'

# Root of the data tree; set DISEASE_DATA_PATH to run outside the original cluster.
disease_data_path = os.getenv(
    'DISEASE_DATA_PATH',
    '/lustre/home/almusawiaf/PhD_Projects/HGNN_Project2/Data',
)

print(num_Diseases, DISEASE_FILE, similarity_type, num_Sample, sampling)


203 DMPLB2 PC 500 True


### Generating complete HG from scratch

In [None]:
# from module1 import generating_HG as gHG
# HG_inst = gHG.Generate_HG()
# nx.write_gml(HG_inst.HG, f'{disease_data_path}/{num_Diseases}_Diagnoses/complete_HG.gml')
# gHG.G_statistics(HG_inst.HG)
# # ======================To sample or not to sample, that is the question =========================
# if not sampling:
#     num_Sample = len(HG_inst.Patients)
#     HG = HG_inst.HG
# else:
#     patients_to_remove = random.sample(HG_inst.Patients, len(HG_inst.Patients) - num_Sample)
#     print(len(patients_to_remove), num_Sample, len(HG_inst.Patients))
    
#     # deleting the nodes
#     HG = gHG.remove_patients_and_linked_visits(patients_to_remove, HG_inst.HG)
# # =================================================================================

In [11]:
from module1 import XY_preparation as XY
from module1 import meta_path as MP
from module1 import reduction as Red

# ---------------------------- Output directory layout ----------------------------
# Start from the per-run directory, create its sub-folders, then narrow
# saving_path down to the HGNN_data sub-folder that receives the outputs.
saving_path = f'{disease_data_path}/{num_Diseases}_Diagnoses/{DISEASE_FILE}/{num_Sample}'

for p in ('HGNN_data', 'clinical_items', 'GMLs', 'OHV', 'PSGs', 'SNFs'):
    os.makedirs(f'{saving_path}/{p}', exist_ok=True)

saving_path = f'{saving_path}/HGNN_data'

# ---------------------------- Load the complete graph ----------------------------
complete_HG_file = f'{disease_data_path}/{num_Diseases}_Diagnoses/complete_HG.gml'
complete_HG = nx.read_gml(complete_HG_file)


## Whole graph or Sample graph

In [15]:
# gHG is used below, but its only import sits in a commented-out cell;
# import it here so this cell runs on a fresh kernel.
from module1 import generating_HG as gHG

HG_obj = heterogeneous_Graph(complete_HG)

gHG.G_statistics(HG_obj.HG)

# ======================To sample or not to sample, that is the question =========================
total_patients = len(HG_obj.Patients)
if not sampling or num_Sample > total_patients:
    # Keep the whole graph. This also covers a requested sample larger than the
    # cohort, where random.sample would be asked for a negative count and
    # raise ValueError.
    num_Sample = total_patients
    HG = HG_obj.HG
else:
    # Randomly choose the patients to drop so that num_Sample patients remain.
    patients_to_remove = random.sample(HG_obj.Patients, total_patients - num_Sample)
    print(len(patients_to_remove), num_Sample, total_patients)

    # deleting the nodes
    HG = gHG.remove_patients_and_linked_visits(patients_to_remove, HG_obj.HG)

number of patients = 46437
number of visits = 58929
number of Medication = 592
number of Diagnoses = 203
number of Procedures = 89
number of Labs = 480
number of MicoBio = 64
number of Edges = 5336561
------------------------------------------

45937 500 46437
Number of PATIENTS to remove: 45937
Number of nodes to remove: 104125


## Extracting the features

In [16]:
# ==================== Extracting patient-based and visit-based X and Y ====================
# A single XY_preparation instance exposes both levels (the recorded output shows
# one construction computing the patient-level and the visit-level feature sets),
# so it is built once instead of twice.
XY_inst = XY.XY_preparation(HG)
# Patient-based features / labels
X = XY_inst.X
Y = XY_inst.Y
# Visit-based features / labels
XV = XY_inst.X_visit
YV = XY_inst.Y_visit
# ==================================== Saving X and Y  (patient-based) ============================
torch.save(X, f'{saving_path}/X.pt')
torch.save(Y, f'{saving_path}/Y.pt')
# ==================================== Saving X and Y (visit-based) =================================
# Save the visit-level objects; previously X and Y were written to these files by mistake.
torch.save(XV, f'{saving_path}/XV.pt')
torch.save(YV, f'{saving_path}/YV.pt')


getting the feature set for all nodes
getting the feature set for all nodes: visit_level
getting the feature set for all nodes
getting the feature set for all nodes: visit_level


## Meta-path Similarities

In [17]:
# ======================= Computing the Meta Path based Similarities ======================
# Use the configured similarity_type (default 'PC') rather than a hard-coded
# value, so the similarity_type setting actually takes effect.
MP_inst = MP.Meta_path(HG, similarity_type = similarity_type, saving_path = saving_path)
# ==================================== SAVING =============================================
# Persist the graph together with the node list taken from the Meta_path instance.
nx.write_gml(HG, f'{saving_path}/HG.gml')
save_list_as_pickle(MP_inst.Nodes,   saving_path, 'Nodes')


extracting As from HG

[(2669, 2669), (2669, 2669), (2669, 2669), (2669, 2669), (2669, 2669), (2669, 2669)]
Multiplication phase...

multiplying (2669, 2669) * (2669, 2669) in parallel...
Done multiplication...
multiplying (2669, 2669) * (2669, 2669) in parallel...
Done multiplication...
multiplying (2669, 2669) * (2669, 2669) in parallel...
Done multiplication...
multiplying (2669, 2669) * (2669, 2669) in parallel...
Done multiplication...
multiplying (2669, 2669) * (2669, 2669) in parallel...
Done multiplication...
multiplying (2669, 2669) * (2669, 2669) in parallel...
Done multiplication...
multiplying (2669, 2669) * (2669, 2669) in parallel...
Done multiplication...
multiplying (2669, 2669) * (2669, 2669) in parallel...
Done multiplication...
multiplying (2669, 2669) * (2669, 2669) in parallel...
Done multiplication...
multiplying (2669, 2669) * (2669, 2669) in parallel...
Done multiplication...
multiplying (2669, 2669) * (2669, 2669) in parallel...
Done multiplication...
multiplyi

## Reduction

In [18]:
# `Reduction` is not defined or imported anywhere in the visible notebook; only
# the module alias `Red` (module1.reduction) is, so reach the class through it.
# NOTE(review): assumes the class is module1.reduction.Reduction — confirm.
reduction_obj = Red.Reduction(saving_path)

Matrix 0: 0 non-zero elements
Matrix 0 has zero values. Not saving...
Matrix 1: 59528 non-zero elements
	Saving all non-zero values... (59528 non-zero elements)
Matrix 2: 52306 non-zero elements
	Saving all non-zero values... (52306 non-zero elements)
Matrix 3: 121107 non-zero elements
	Saving all non-zero values... (121107 non-zero elements)
Matrix 4: 6217 non-zero elements
	Saving all non-zero values... (6217 non-zero elements)
Matrix 5: 0 non-zero elements
Matrix 5 has zero values. Not saving...
Matrix 6: 100953 non-zero elements
	Saving all non-zero values... (100953 non-zero elements)
Matrix 7: 81974 non-zero elements
	Saving all non-zero values... (81974 non-zero elements)
Matrix 8: 336302 non-zero elements
	Saving all non-zero values... (336302 non-zero elements)
Matrix 9: 9381 non-zero elements
	Saving all non-zero values... (9381 non-zero elements)
Matrix 10: 0 non-zero elements
Matrix 10 has zero values. Not saving...
Matrix 11: 20273 non-zero elements
	Saving all non-zero va