# Generating the data
1. Generate the Heterogeneous graph
2. Generate the feature set from the clinical notes.
3. Generate the Labels
4. Generate the k-metapath-based similarity matrices
5. Convert the similarity matrices (the As) to an edge-based representation.

In [1]:
import os
import torch
import networkx as nx
import random


def save_list_as_pickle(L, given_path, file_name):
    """Pickle ``L`` into ``<given_path>/<file_name>.pkl``.

    The destination path is printed before the file is written. The file is
    opened in binary write mode, so an existing file of that name is
    overwritten.

    Args:
        L: Object to serialize.
        given_path: Directory that receives the file.
        file_name: File name without the ``.pkl`` extension.
    """
    import pickle

    target = f'{given_path}/{file_name}.pkl'
    print(f'saving to {target}')
    with open(target, 'wb') as handle:
        pickle.dump(L, handle)


In [2]:
class heterogeneous_Graph:
    """Wrap a graph and bucket its nodes by the first character of their id.

    Buckets: 'C' -> Patients, 'V' -> Visits, 'M' -> Medications,
    'D' -> Diagnoses, 'P' -> Procedures, 'L' -> Labs, 'B' -> MicroBio.
    ``Nodes`` is the concatenation of the buckets in that order; a node whose
    id starts with any other character is in none of the lists.
    """

    def __init__(self, G):
        self.HG = G
        all_nodes = list(self.HG.nodes())

        def with_prefix(tag):
            # Graph iteration order is kept inside each bucket.
            return [node for node in all_nodes if node[0] == tag]

        self.Patients = with_prefix('C')
        self.Visits = with_prefix('V')
        self.Medications = with_prefix('M')
        self.Diagnoses = with_prefix('D')
        self.Procedures = with_prefix('P')
        self.Labs = with_prefix('L')
        self.MicroBio = with_prefix('B')

        self.Nodes = [
            *self.Patients,
            *self.Visits,
            *self.Medications,
            *self.Diagnoses,
            *self.Procedures,
            *self.Labs,
            *self.MicroBio,
        ]
        


In [3]:

# ----------------------------- Run configuration -----------------------------
# Each setting is read from an environment variable; the second argument of
# os.getenv is the default used when the variable is not set.


def parse_flag(value):
    """Interpret an environment-style string as a boolean.

    'true', '1', 'yes', 'y' and 'on' count as True, ignoring case and
    surrounding whitespace; every other value is False. The previous exact
    comparison with 'True' silently treated 'true' or '1' as False.

    Args:
        value: Raw setting, normally the string returned by os.getenv.

    Returns:
        bool: The parsed flag.
    """
    return str(value).strip().lower() in {'true', '1', 'yes', 'y', 'on'}


# Root folder of the raw MIMIC resources. Set MIMIC_Path in the environment to
# point at a machine-specific location instead of editing this cell.
MIMIC_Path        = os.getenv('MIMIC_Path', '../../MIMIC_resources')

disease_data_path = os.getenv('disease_data_path', '../Data')

num_Diseases    = int(os.getenv('NUM_DISEASES', 203))
DISEASE_FILE    = os.getenv('DISEASE_FILE', 'DMPLB2')
similarity_type = os.getenv('similarity_type', 'PC')  # options are PC: PathCount, SPS: Symmetric PathSim

num_Sample      = int(os.getenv('num_Sample', 250))
r_u_sampling    = os.getenv('r_u_sampling', 'True')
PSGs_ing        = os.getenv('PSGs_ing', 'False')

# Booleans consumed by the later cells.
sampling = parse_flag(r_u_sampling)
PSGs_ing = parse_flag(PSGs_ing)

print(num_Diseases, DISEASE_FILE, similarity_type, num_Sample, sampling)


203 DMPLB2 PC 250 True


In [4]:
# =================================================================================
# Output layout: <disease_data_path>/<num_Diseases>_Diagnoses/<DISEASE_FILE>/<num_Sample>/
base_path = f'{disease_data_path}/{num_Diseases}_Diagnoses/{DISEASE_FILE}/{num_Sample}'
saving_path = f'{base_path}/HGNN_data'

# Create every output folder up front; exist_ok makes re-running the cell safe.
output_dirs = [
    f'{base_path}/{sub}'
    for sub in ('HGNN_data', 'clinical_items', 'GMLs', 'OHV', 'PSGs', 'SNFs')
]
output_dirs += [f'{saving_path}/As', f'{saving_path}/edges']
for folder in output_dirs:
    os.makedirs(folder, exist_ok=True)
# =================================================================================

# complete_HG = nx.read_gml(f'{disease_data_path}/{num_Diseases}_Diagnoses/complete_HG.gml')
print(saving_path)


../Data/203_Diagnoses/DMPLB2/250/HGNN_data


### Generating complete HG from scratch

In [5]:
from module1 import generating_HG as gHG

# Build the heterogeneous graph from the MIMIC resources and write it to disk
# before any sampling happens.
HG_inst = gHG.Generate_HG(MIMIC_Path)
nx.write_gml(HG_inst.HG, f'{disease_data_path}/{num_Diseases}_Diagnoses/complete_HG.gml')
gHG.G_statistics(HG_inst.HG)
# ======================To sample or not to sample, that is the question =========================
if not sampling:
    num_Sample = len(HG_inst.Patients)
    HG = HG_inst.HG
else:
    # Private, seeded RNG so the sampled cohort is reproducible across runs.
    # Set RANDOM_SEED in the environment to draw a different cohort.
    sampler = random.Random(int(os.getenv('RANDOM_SEED', 42)))
    # random.sample raises ValueError for a negative size, which would happen
    # when num_Sample exceeds the number of available patients; clamp at zero.
    n_remove = max(0, len(HG_inst.Patients) - num_Sample)
    patients_to_remove = sampler.sample(HG_inst.Patients, n_remove)
    print(len(patients_to_remove), num_Sample, len(HG_inst.Patients))

    # deleting the nodes
    HG = gHG.remove_patients_and_linked_visits(patients_to_remove, HG_inst.HG)
# =================================================================================
# NOTE(review): complete_HG is the same object as HG_inst.HG. If
# remove_patients_and_linked_visits edits its argument in place, this name
# already refers to the sampled graph, not the complete one - confirm.
complete_HG = HG_inst.HG

Loading the dataframes...
Splitting lab tests


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  lab_df.loc[:, 'ITEMID_FLAG'] = lab_df['ITEMID'].astype(str) + '_' + lab_df['FLAG'].astype(str)


Number of visits here is 58151
Use the patients inside the new DataFrame....
Dropping NaN visits
General Information:
---------------------------
Number of Patients = 46517
Number of Visits = 58929
Number of Diagnosis = 203
Number of procedures = 89
Number of Medication = 592
Number of Lab tests  = 993
Number of MicroBio   = 65
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Extracting bipartite networks...

Extracting and adding data of Visits

Extracting and adding data of Diagnosis

Extracting and adding data of Procedures

Extracting and adding data of Medications

Extracting and adding data of Lab tests

Extracting and adding data of MicroBiology tests
number of patients = 46437
number of visits = 58929
number of Medication = 592
number of Diagnoses = 203
number of Procedures = 89
number of Labs = 993
number of MicoBio = 64
number of Edges = 5360286
------------------------------------------

Removing isolated nodes
Number of PATIENTS to remove: 0
Number of nodes to re

## Whole graph or Sample graph

In [6]:
# complete_HG = nx.read_gml(f'{disease_data_path}/{num_Diseases}_Diagnoses/complete_HG.gml')
# Bucket the nodes of the graph by type so the patient list can be sampled.
HG_obj = heterogeneous_Graph(complete_HG)

gHG.G_statistics(HG_obj.HG)

# ======================To sample or not to sample, that is the question =========================
if not sampling:
    num_Sample = len(HG_obj.Patients)
    HG = HG_obj.HG
else:
    # Private, seeded RNG so the sampled cohort is reproducible across runs.
    # Set RANDOM_SEED in the environment to draw a different cohort.
    sampler = random.Random(int(os.getenv('RANDOM_SEED', 42)))
    # random.sample raises ValueError for a negative size, which would happen
    # when num_Sample exceeds the number of patients in the graph; clamp at zero.
    n_remove = max(0, len(HG_obj.Patients) - num_Sample)
    patients_to_remove = sampler.sample(HG_obj.Patients, n_remove)
    print(len(patients_to_remove), num_Sample, len(HG_obj.Patients))

    # deleting the nodes
    HG = gHG.remove_patients_and_linked_visits(patients_to_remove, HG_obj.HG)

number of patients = 250
number of visits = 452
number of Medication = 592
number of Diagnoses = 203
number of Procedures = 89
number of Labs = 480
number of MicoBio = 64
number of Edges = 35274
------------------------------------------

0 250 250
Number of PATIENTS to remove: 0
Number of nodes to remove: 0


## Extracting the features

In [7]:
from module1 import XY_preparation as XY
# ============================ Extracting X and Y (patient- and visit-based) ====================
# One XY_preparation instance is enough: the patient-level (X, Y) and the
# visit-level (X_visit, Y_visit) attributes are all read from the same class
# built on the same graph, so constructing it a second time only repeats work.
XY_inst = XY.XY_preparation(HG)
X = XY_inst.X
Y = XY_inst.Y
XV = XY_inst.X_visit
YV = XY_inst.Y_visit
# ==================================== Saving X and Y  (patient-based) ============================
torch.save(X, f'{base_path}/OHV/X.pt')
torch.save(Y, f'{base_path}/OHV/Y.pt')
# ==================================== Saving X and Y (visit-based) =================================
# Fix: the visit-level files previously received the patient-level X and Y.
torch.save(XV, f'{base_path}/OHV/XV.pt')
torch.save(YV, f'{base_path}/OHV/YV.pt')
# Drop the notebook-level names once everything is on disk.
del X
del Y
del XV
del YV


getting the feature set for all nodes
getting the feature set for all nodes: visit_level
getting the feature set for all nodes
getting the feature set for all nodes: visit_level


## Meta-path Similarities

In [8]:
from module1 import meta_path_2 as MP
# ======================= Computing the Meta Path based Similarities ======================
# Use the configured similarity_type (default 'PC') instead of a hard-coded
# 'PC', so that overriding the setting in the environment actually takes effect.
MP_inst = MP.Meta_path(HG, similarity_type = similarity_type, saving_path = saving_path)
# ==================================== SAVING =============================================
# Persist the working graph and the node list held by the meta-path object.
nx.write_gml(HG, f'{base_path}/GMLs/HG.gml')
save_list_as_pickle(MP_inst.Nodes,   f'{base_path}/GMLs', 'Nodes')


extracting As from HG

Patients:
	Working on: Patient-Medication
	Working on: Patient-Diagnosis
	Working on: Patient-Procedure
	Working on: Patient-Lab
	Working on: Patient-MicroBiology
Diagnoses:
	Working on: Diagnosis-Medication
	Working on: Diagnosis-Procedure
	Working on: Diagnosis-Lab
	Working on: Diagnosis-MicroBiology
Procedures:
	Working on: Procedure-Medication
	Working on: Procedure-Lab
	Working on: Procedure-MicroBiology
	Working on: Medication-Lab
	Working on: Medication-MicroBiology
	Working on: Lab-MicroBiology
Homogeneous similarity
1. Patient-Patient
	Working on: Patient-Visit-Medication-Visit-Patient
	Working on: Patient-Visit-Diagnosis-Visit-Patient
	Working on: Patient-Visit-Procedure-Visit-Patient
	Working on: Patient-Visit-Lab-Visit-Patient
	Working on: Patient-Visit-MicroBiology-Visit-Patient
2. visit-visit
	Working on: Visit-Medication-Visit
	Working on: Visit-Diagnosis-Visit
	Working on: Visit-Procedure-Visit
	Working on: Visit-Lab-Visit
	Working on: Visit-Micro

## External Features

In [17]:
from module1 import patients_sim as PS

# Bucket the nodes of the working graph by type; the resulting node list is
# handed to the similarity step below.
HG_obj2 = heterogeneous_Graph(HG)

# Saving the data to PSGs/. The call stays the last expression of the cell so
# the notebook still displays its value.
PS.Patients_Similarity(
    HG_obj2.HG,
    HG_obj2.Nodes,
    base_path,
)

Measure the similarity, expand it and save to PSGs/M.npz
Getting the OHV for M
Measure the similarity, expand it and save to PSGs/D.npz
Getting the OHV for D
Measure the similarity, expand it and save to PSGs/P.npz
Getting the OHV for P
Measure the similarity, expand it and save to PSGs/L.npz
Getting the OHV for L
Measure the similarity, expand it and save to PSGs/B.npz
Getting the OHV for B


<__main__.Patients_Similarity at 0x7f216478d360>

## Reduction

In [19]:
from module1 import reduction as Red

# PSGs_ing is the boolean flag derived from the PSGs_ing environment setting.
Red.Reduction(
    base_path,
    PSGs=PSGs_ing,
)

30
Matrix 0: 327 non-zero elements
	Saving all non-zero values... (327 non-zero elements)
Matrix 1: 607 non-zero elements
	Saving all non-zero values... (607 non-zero elements)
Matrix 2: 3044 non-zero elements
	Saving all non-zero values... (3044 non-zero elements)
Matrix 3: 1097 non-zero elements
	Saving all non-zero values... (1097 non-zero elements)
Matrix 4: 29989 non-zero elements
	Saving all non-zero values... (29989 non-zero elements)
Matrix 5: 210 non-zero elements
	Saving all non-zero values... (210 non-zero elements)
Matrix 6: 141 non-zero elements
	Saving all non-zero values... (141 non-zero elements)
Matrix 7: 2433 non-zero elements
	Saving all non-zero values... (2433 non-zero elements)
Matrix 8: 817 non-zero elements
	Saving all non-zero values... (817 non-zero elements)
Matrix 9: 19757 non-zero elements
	Saving all non-zero values... (19757 non-zero elements)
Matrix 10: 158 non-zero elements
	Saving all non-zero values... (158 non-zero elements)
Matrix 11: 3713 non-zero 