# Generating the data
1. Generate the Heterogeneous graph
2. Generate the feature set from the clinical notes.
3. Generate the Labels
4. Generate the k-metapath-based similarity matrices
5. Convert the similarity matrices (the `A`s) to an edge-based representation.

In [1]:
class heterogeneous_Graph:
    """Wrap a heterogeneous graph and index its nodes by type.

    A node's type is decided by the first character of its identifier:
    'C' -> Patients, 'V' -> Visits, 'M' -> Medications, 'D' -> Diagnoses,
    'P' -> Procedures, 'L' -> Labs, 'B' -> MicroBio.

    Attributes:
        HG: the graph object passed in (stored by reference, not copied).
        Patients, Visits, Medications, Diagnoses, Procedures, Labs, MicroBio:
            lists of node identifiers of each type, in the graph's node order.
        Nodes: concatenation of the seven typed lists, in the order above.
            Nodes with any other prefix are not included.
    """

    def __init__(self, G):
        self.HG = G
        all_nodes = list(self.HG.nodes())

        def with_prefix(letter):
            # Keep the nodes whose identifier starts with `letter`.
            return [node for node in all_nodes if node[0] == letter]

        self.Patients = with_prefix('C')
        self.Visits = with_prefix('V')
        self.Medications = with_prefix('M')
        self.Diagnoses = with_prefix('D')
        self.Procedures = with_prefix('P')
        self.Labs = with_prefix('L')
        self.MicroBio = with_prefix('B')

        # Typed nodes only, grouped by type in a fixed order.
        self.Nodes = [
            *self.Patients,
            *self.Visits,
            *self.Medications,
            *self.Diagnoses,
            *self.Procedures,
            *self.Labs,
            *self.MicroBio,
        ]
        


In [2]:
import os
import torch
import networkx as nx
import random


def save_list_as_pickle(L, given_path, file_name):
    """Pickle `L` to `<given_path>/<file_name>.pkl`.

    The destination is printed before writing. An existing file at that
    location is overwritten (the file is opened in 'wb' mode).
    """
    import pickle

    target = f'{given_path}/{file_name}.pkl'
    print(f'saving to {target}')
    with open(target, 'wb') as handle:
        pickle.dump(L, handle)


# ---- Run configuration -------------------------------------------------------
# Every value below can be overridden through an environment variable of the
# name passed to os.getenv; the second argument is the default.
num_Diseases    = int(os.getenv('NUM_DISEASES', 203))
DISEASE_FILE    = os.getenv('DISEASE_FILE', 'DMPLB2')
similarity_type = os.getenv('similarity_type', 'PC')  # options are PC: PathCount, SPS: Symmetric PathSim

num_Sample      = int(os.getenv('num_Sample', 250))
r_u_sampling    = os.getenv('r_u_sampling', 'True')
SNF_ing         = os.getenv('SNF_ing', 'True')

# Flags arrive as text: only the exact string 'True' switches a flag on,
# any other value switches it off.
sampling = (r_u_sampling == 'True')
SNF_ing  = (SNF_ing == 'True')

# Root of the data tree. Read from DISEASE_DATA_PATH when set, so the notebook
# is not tied to a single user's home directory; the default is unchanged.
disease_data_path = os.getenv(
    'DISEASE_DATA_PATH',
    '/lustre/home/almusawiaf/PhD_Projects/HGNN_Project2/Data',
)

print(num_Diseases, DISEASE_FILE, similarity_type, num_Sample, sampling)


203 DMPLB2 PC 500 True


In [3]:
# =================================================================================
# Root folder of this run: <data root>/<n>_Diagnoses/<disease file>/<sample size>
saving_path = f'{disease_data_path}/{num_Diseases}_Diagnoses/{DISEASE_FILE}/{num_Sample}'

# Create every output sub-folder up front; existing folders are left alone.
for p in ('HGNN_data', 'clinical_items', 'GMLs', 'OHV', 'PSGs', 'SNFs'):
    os.makedirs(f'{saving_path}/{p}', exist_ok=True)

# From here on saving_path points at the HGNN_data sub-folder of the run.
saving_path = f'{saving_path}/HGNN_data'
os.makedirs(f'{saving_path}/As', exist_ok=True)
# =================================================================================

# Load the previously generated complete heterogeneous graph from disk.
complete_HG = nx.read_gml(f'{disease_data_path}/{num_Diseases}_Diagnoses/complete_HG.gml')


### Generating complete HG from scratch

In [4]:
# gHG provides the graph helpers called later in this notebook
# (G_statistics and remove_patients_and_linked_visits).
from module1 import generating_HG as gHG
# NOTE: everything below is currently disabled (commented out). It builds the
# complete heterogeneous graph from scratch with gHG.Generate_HG() and writes it
# to complete_HG.gml, which is the same file this notebook loads with
# nx.read_gml. Re-enable it only when that file has to be regenerated.
# HG_inst = gHG.Generate_HG()
# nx.write_gml(HG_inst.HG, f'{disease_data_path}/{num_Diseases}_Diagnoses/complete_HG.gml')
# gHG.G_statistics(HG_inst.HG)
# # ======================To sample or not to sample, that is the question =========================
# if not sampling:
#     num_Sample = len(HG_inst.Patients)
#     HG = HG_inst.HG
# else:
#     patients_to_remove = random.sample(HG_inst.Patients, len(HG_inst.Patients) - num_Sample)
#     print(len(patients_to_remove), num_Sample, len(HG_inst.Patients))
    
#     # deleting the nodes
#     HG = gHG.remove_patients_and_linked_visits(patients_to_remove, HG_inst.HG)
# # =================================================================================

## Whole graph or Sample graph

In [5]:
# Index the loaded graph by node type and print its size statistics.
HG_obj = heterogeneous_Graph(complete_HG)

gHG.G_statistics(HG_obj.HG)

# ======================To sample or not to sample, that is the question =========================
if not sampling:
    # Keep every patient. saving_path was already built from the configured
    # num_Sample, so this reassignment does not change the output folder.
    num_Sample = len(HG_obj.Patients)
    HG = HG_obj.HG
else:
    total_patients = len(HG_obj.Patients)

    # Fail early with a readable message instead of the generic error that
    # random.sample raises for a negative or oversized sample.
    if not 0 <= num_Sample <= total_patients:
        raise ValueError(
            f'num_Sample must be between 0 and {total_patients} '
            f'(the number of patients), got {num_Sample}'
        )

    # Opt-in reproducibility: when RANDOM_SEED is set, the same patients are
    # drawn on every run. When it is unset the draw stays random, as before.
    random_seed = os.getenv('RANDOM_SEED')
    if random_seed is not None:
        random.seed(int(random_seed))

    # Draw the patients to DROP, so that num_Sample patients remain.
    patients_to_remove = random.sample(HG_obj.Patients, total_patients - num_Sample)
    print(len(patients_to_remove), num_Sample, total_patients)

    # deleting the nodes
    HG = gHG.remove_patients_and_linked_visits(patients_to_remove, HG_obj.HG)

number of patients = 46437
number of visits = 58929
number of Medication = 592
number of Diagnoses = 203
number of Procedures = 89
number of Labs = 480
number of MicoBio = 64
number of Edges = 5336561
------------------------------------------

45937 500 46437
Number of PATIENTS to remove: 45937
Number of nodes to remove: 104124


## Extracting the features

In [6]:
from module1 import XY_preparation as XY
# Build the extractor once. The logged output of this cell shows that a single
# construction runs both the patient-level and the visit-level pass, so one
# instance appears sufficient for both feature sets.
XY_inst = XY.XY_preparation(HG)
# ============================ Extracting Patient-based X and Y =================================
X = XY_inst.X
Y = XY_inst.Y
# ============================ Extracting Visit-based X and Y =================================
XV = XY_inst.X_visit
YV = XY_inst.Y_visit
# ==================================== Saving X and Y  (patient-based) ============================
torch.save(X, f'{saving_path}/X.pt')
torch.save(Y, f'{saving_path}/Y.pt')
# ==================================== Saving X and Y (visit-based) =================================
# Fix: the visit-level files must hold XV / YV. Previously X / Y were written
# here, so XV.pt and YV.pt were copies of the patient-level data.
torch.save(XV, f'{saving_path}/XV.pt')
torch.save(YV, f'{saving_path}/YV.pt')
# Drop the notebook-level references; the data is on disk now.
del X
del Y
del XV
del YV


getting the feature set for all nodes
getting the feature set for all nodes: visit_level


getting the feature set for all nodes
getting the feature set for all nodes: visit_level


## Meta-path Similarities

In [7]:
from module1 import meta_path as MP
# ======================= Computing the Meta Path based Similarities ======================
# Fix: pass the configured similarity_type (default 'PC') instead of a
# hard-coded 'PC', so the similarity_type setting actually takes effect.
MP_inst = MP.Meta_path(HG, similarity_type=similarity_type, saving_path=saving_path)
# ==================================== SAVING =============================================
# Persist the (possibly sampled) graph and the node list held by MP_inst.
nx.write_gml(HG, f'{saving_path}/HG.gml')
save_list_as_pickle(MP_inst.Nodes, saving_path, 'Nodes')


extracting As from HG



[(2670, 2670), (2670, 2670), (2670, 2670), (2670, 2670), (2670, 2670), (2670, 2670)]
Multiplication phase...



multiplying (2670, 2670) * (2670, 2670) in parallel...


Done multiplication...
multiplying (2670, 2670) * (2670, 2670) in parallel...


Done multiplication...
multiplying (2670, 2670) * (2670, 2670) in parallel...


Done multiplication...
multiplying (2670, 2670) * (2670, 2670) in parallel...


Done multiplication...
multiplying (2670, 2670) * (2670, 2670) in parallel...


Done multiplication...
multiplying (2670, 2670) * (2670, 2670) in parallel...


Done multiplication...
multiplying (2670, 2670) * (2670, 2670) in parallel...


Done multiplication...
multiplying (2670, 2670) * (2670, 2670) in parallel...


Done multiplication...
multiplying (2670, 2670) * (2670, 2670) in parallel...


Done multiplication...
multiplying (2670, 2670) * (2670, 2670) in parallel...


Done multiplication...
multiplying (2670, 2670) * (2670, 2670) in parallel...


Done multiplication...
multiplying (2670, 2670) * (2670, 2670) in parallel...


Done multiplication...
multiplying (2670, 2670) * (2670, 2670) in parallel...


Done multiplication...
multiplying (2670, 2670) * (2670, 2670) in parallel...


Done multiplication...
multiplying (2670, 2670) * (2670, 2670) in parallel...


Done multiplication...
multiplying (2670, 2670) * (2670, 2670) in parallel...


Done multiplication...
multiplying (2670, 2670) * (2670, 2670) in parallel...
Done multiplication...
multiplying (2670, 2670) * (2670, 2670) in parallel...
Done multiplication...
multiplying (2670, 2670) * (2670, 2670) in parallel...
Done multiplication...
multiplying (2670, 2670) * (2670, 2670) in parallel...
Done multiplication...
Patient-Patient completed!

multiplying (2670, 2670) * (2670, 2670) in parallel...
Done multiplication...


multiplying (2670, 2670) * (2670, 2670) in parallel...
Done multiplication...
multiplying (2670, 2670) * (2670, 2670) in parallel...
Done multiplication...


multiplying (2670, 2670) * (2670, 2670) in parallel...
Done multiplication...
multiplying (2670, 2670) * (2670, 2670) in parallel...
Done multiplication...
visit-visit completed!

multiplying (2670, 2670) * (2670, 2670) in parallel...
Done multiplication...
multiplying (2670, 2670) * (2670, 2670) in parallel...
Done multiplication...
multiplying (2670, 2670) * (2670, 2670) in parallel...


Done multiplication...
multiplying (2670, 2670) * (2670, 2670) in parallel...
Done multiplication...
Diagnoses-Diagnoses completed!

multiplying (2670, 2670) * (2670, 2670) in parallel...
Done multiplication...
multiplying (2670, 2670) * (2670, 2670) in parallel...
Done multiplication...
multiplying (2670, 2670) * (2670, 2670) in parallel...
Done multiplication...
multiplying (2670, 2670) * (2670, 2670) in parallel...
Done multiplication...
Med-Med completed!

multiplying (2670, 2670) * (2670, 2670) in parallel...
Done multiplication...
multiplying (2670, 2670) * (2670, 2670) in parallel...
Done multiplication...
multiplying (2670, 2670) * (2670, 2670) in parallel...
Done multiplication...
multiplying (2670, 2670) * (2670, 2670) in parallel...
Done multiplication...
Proced-Proced completed!

multiplying (2670, 2670) * (2670, 2670) in parallel...
Done multiplication...
multiplying (2670, 2670) * (2670, 2670) in parallel...
Done multiplication...
multiplying (2670, 2670) * (2670, 2670) i

Done multiplication...
multiplying (2670, 2670) * (2670, 2670) in parallel...
Done multiplication...
multiplying (2670, 2670) * (2670, 2670) in parallel...
Done multiplication...
multiplying (2670, 2670) * (2670, 2670) in parallel...
Done multiplication...
Micro-Bio - Micro-Bio completed!

Multiplication phase...

Number of meta-paths = 43


saving to /lustre/home/almusawiaf/PhD_Projects/HGNN_Project2/Data/203_Diagnoses/DMPLB2/500/HGNN_data/As/selected_i.pkl
selected i = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42]


saving to /lustre/home/almusawiaf/PhD_Projects/HGNN_Project2/Data/203_Diagnoses/DMPLB2/500/HGNN_data/Nodes.pkl


## Reduction

In [8]:
from module1 import reduction as Red
# Run the reduction step over the files under saving_path.
# NOTE(review): presumably this converts the saved similarity matrices to an
# edge-based form (step 5 of the outline) -- confirm in module1.reduction.
reduction_obj = Red.Reduction(saving_path)

Matrix 0: 1 non-zero elements
	Saving all non-zero values... (1 non-zero elements)
Matrix 1: 62726 non-zero elements
	Saving all non-zero values... (62726 non-zero elements)
Matrix 2: 49648 non-zero elements
	Saving all non-zero values... (49648 non-zero elements)
Matrix 3: 121666 non-zero elements
	Saving all non-zero values... (121666 non-zero elements)
Matrix 4: 8048 non-zero elements
	Saving all non-zero values... (8048 non-zero elements)
Matrix 5: 764 non-zero elements
	Saving all non-zero values... (764 non-zero elements)
Matrix 6: 104009 non-zero elements
	Saving all non-zero values... (104009 non-zero elements)
Matrix 7: 77155 non-zero elements
	Saving all non-zero values... (77155 non-zero elements)
Matrix 8: 336606 non-zero elements
	Saving all non-zero values... (336606 non-zero elements)
Matrix 9: 11195 non-zero elements
	Saving all non-zero values... (11195 non-zero elements)
Matrix 10: 190 non-zero elements
	Saving all non-zero values... (190 non-zero elements)
Matrix 11:

done saving [unique edges]:  683046
Working on 0th file...


Working on 1th file...


Working on 2th file...


Working on 3th file...


Working on 4th file...


Working on 5th file...


Working on 6th file...


Working on 7th file...


Working on 8th file...


Working on 9th file...


Working on 10th file...


Working on 11th file...


Working on 12th file...


Working on 13th file...


Working on 14th file...


Working on 15th file...


Working on 16th file...


Working on 17th file...


Working on 18th file...


Working on 19th file...


Working on 20th file...


Working on 21th file...


Working on 22th file...


Working on 23th file...


Working on 24th file...


Working on 25th file...


Working on 26th file...


Working on 27th file...


Working on 28th file...


Working on 29th file...


Working on 30th file...


Working on 31th file...


Working on 32th file...


Working on 33th file...


Working on 34th file...


Working on 35th file...


Working on 36th file...


Working on 37th file...


Working on 38th file...


Working on 39th file...


Working on 40th file...


Working on 41th file...
