# Generating the data
1. Generate the Heterogeneous graph
2. Generate the feature set from the clinical notes.
3. Generate the Labels
4. Generate the k-metapath-based similarity matrices
5. Convert the similarity matrices (the `As`) to an edge-based representation.

In [1]:
class heterogeneous_Graph:
    """Wrap a graph and bucket its nodes by the first element of each node id.

    Attributes set by the constructor:
        HG          -- the graph object passed in (stored as-is).
        Patients    -- nodes whose first element equals 'C'.
        Visits      -- nodes whose first element equals 'V'.
        Medications -- nodes whose first element equals 'M'.
        Diagnoses   -- nodes whose first element equals 'D'.
        Procedures  -- nodes whose first element equals 'P'.
        Labs        -- nodes whose first element equals 'L'.
        MicroBio    -- nodes whose first element equals 'B'.
        Nodes       -- concatenation of the seven lists above, in that order;
                       nodes matching none of the prefixes are left out.
    """

    def __init__(self, G):
        self.HG = G
        all_nodes = list(self.HG.nodes())

        def starting_with(prefix):
            # Keeps the graph's own node order inside each bucket.
            return [node for node in all_nodes if node[0] == prefix]

        self.Patients = starting_with('C')
        self.Visits = starting_with('V')
        self.Medications = starting_with('M')
        self.Diagnoses = starting_with('D')
        self.Procedures = starting_with('P')
        self.Labs = starting_with('L')
        self.MicroBio = starting_with('B')

        # Flat node list grouped by type (patients first, micro-biology last).
        self.Nodes = [
            *self.Patients,
            *self.Visits,
            *self.Medications,
            *self.Diagnoses,
            *self.Procedures,
            *self.Labs,
            *self.MicroBio,
        ]
        


In [2]:
import os
import torch
import networkx as nx
import random


def save_list_as_pickle(L, given_path, file_name):
    """Pickle ``L`` to ``{given_path}/{file_name}.pkl``.

    Prints the destination before writing. The target directory must already
    exist; an existing file of the same name is overwritten ('wb' mode).
    """
    import pickle

    destination = f'{given_path}/{file_name}.pkl'
    print(f'saving to {destination}')
    with open(destination, 'wb') as handle:
        pickle.dump(L, handle)

# Data locations. The defaults below are the OAK paths; set the environment
# variables of the same name to point somewhere else.
MIMIC_Path        = os.getenv('MIMIC_Path',        '/home/almusawiaf/MyDocuments/PhD_Projects/Data/MIMIC_resources')
disease_data_path = os.getenv('disease_data_path', '/home/almusawiaf/MyDocuments/PhD_Projects/HGNN_Project2/Data')

# Alternative defaults used on Athena (kept for reference):
# MIMIC_Path        = os.getenv('MIMIC_Path', '/home/almusawiaf/MyDocuments/PhD_Projects/Data/MIMIC_resources')
# disease_data_path = os.getenv('disease_data_path', '/home/almusawiaf/PhD_Projects/HGNN_Project2/Data')

# Experiment parameters, each overridable through the environment.
num_Diseases    = int(os.getenv('NUM_DISEASES', 203))
DISEASE_FILE    = os.getenv('DISEASE_FILE', 'DMPLB2')
similarity_type = os.getenv('similarity_type', 'PC')  # options are PC: PathCount, SPS: Symmetric PathSim

num_Sample      = int(os.getenv('num_Sample', 250))
r_u_sampling    = os.getenv('r_u_sampling', 'True')

# Flags arrive as text: only the exact string 'True' switches a flag on,
# every other value switches it off.
sampling = (r_u_sampling == 'True')
SNF_ing  = (os.getenv('SNF_ing', 'False') == 'True')

print(num_Diseases, DISEASE_FILE, similarity_type, num_Sample, sampling)


203 DMPLB2 PC 250 True


In [3]:
# =================================================================================
# Folder for this configuration: <data>/<num_Diseases>_Diagnoses/<DISEASE_FILE>/<num_Sample>
experiment_root = f'{disease_data_path}/{num_Diseases}_Diagnoses/{DISEASE_FILE}/{num_Sample}'

# Create every per-experiment sub-folder up front (no error if it already exists).
for subdir in ('HGNN_data', 'clinical_items', 'GMLs', 'OHV', 'PSGs', 'SNFs'):
    os.makedirs(f'{experiment_root}/{subdir}', exist_ok=True)

# The rest of the notebook writes into HGNN_data; its `As` sub-folder is created here too.
saving_path = f'{experiment_root}/HGNN_data'
os.makedirs(f'{saving_path}/As', exist_ok=True)
# =================================================================================

# complete_HG = nx.read_gml(f'{disease_data_path}/{num_Diseases}_Diagnoses/complete_HG.gml')


### Generating complete HG from scratch

In [4]:
from module1 import generating_HG as gHG
# Build the complete heterogeneous graph with gHG.Generate_HG from the data under
# MIMIC_Path, write it out as GML, and print its statistics.
HG_inst = gHG.Generate_HG(MIMIC_Path)
nx.write_gml(HG_inst.HG, f'{disease_data_path}/{num_Diseases}_Diagnoses/complete_HG.gml')
gHG.G_statistics(HG_inst.HG)
# ======================To sample or not to sample, that is the question =========================
if not sampling:
    # Use the full graph.
    # NOTE(review): saving_path was built from the earlier num_Sample value and is
    # not recomputed after this reassignment.
    num_Sample = len(HG_inst.Patients)
    HG = HG_inst.HG
else:
    # Keep num_Sample randomly chosen patients by removing all the others.
    # random.sample raises ValueError on a negative sample size, so clamp at 0:
    # when num_Sample exceeds the number of patients, nobody is removed.
    num_to_remove = max(0, len(HG_inst.Patients) - num_Sample)
    patients_to_remove = random.sample(HG_inst.Patients, num_to_remove)
    print(len(patients_to_remove), num_Sample, len(HG_inst.Patients))

    # deleting the nodes
    HG = gHG.remove_patients_and_linked_visits(patients_to_remove, HG_inst.HG)
# =================================================================================

Loading the dataframes...
Splitting lab tests


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  lab_df.loc[:, 'ITEMID_FLAG'] = lab_df['ITEMID'].astype(str) + '_' + lab_df['FLAG'].astype(str)


Number of visits here is 58151
Use the patients inside the new DataFrame....
Dropping NaN visits


KeyboardInterrupt: 

## Whole graph or Sample graph

In [None]:
# Wrap the graph `HG` (which must already exist from an earlier cell) so its
# nodes are grouped by type, then print its statistics.
HG_obj = heterogeneous_Graph(HG)

gHG.G_statistics(HG_obj.HG)

# ======================To sample or not to sample, that is the question =========================
if not sampling:
    # Use the whole graph; num_Sample becomes the actual patient count.
    num_Sample = len(HG_obj.Patients)
    HG = HG_obj.HG
else:
    # Keep num_Sample randomly chosen patients by removing all the others.
    # random.sample raises ValueError on a negative sample size, so clamp at 0:
    # when the graph has no more than num_Sample patients, nobody is removed.
    num_to_remove = max(0, len(HG_obj.Patients) - num_Sample)
    patients_to_remove = random.sample(HG_obj.Patients, num_to_remove)
    print(len(patients_to_remove), num_Sample, len(HG_obj.Patients))

    # deleting the nodes
    HG = gHG.remove_patients_and_linked_visits(patients_to_remove, HG_obj.HG)

## Extracting the features

In [None]:
from module1 import XY_preparation as XY
# ============================ Extracting Patient-based X and Y =================================
XY_inst = XY.XY_preparation(HG)
X = XY_inst.X
Y = XY_inst.Y
# ============================ Extracting Visit-based X and Y =================================
# NOTE(review): this second XY_preparation(HG) call looks redundant with the one
# above; it is kept because the constructor's side effects are not visible here.
XY_inst = XY.XY_preparation(HG)
XV = XY_inst.X_visit
YV = XY_inst.Y_visit
# ==================================== Saving X and Y  (patient-based) ============================
torch.save(X, f'{saving_path}/X.pt')
torch.save(Y, f'{saving_path}/Y.pt')
# ==================================== Saving X and Y (visit-based) =================================
# Write the visit-level tensors; the previous code saved the patient-level X and Y
# into XV.pt / YV.pt.
torch.save(XV, f'{saving_path}/XV.pt')
torch.save(YV, f'{saving_path}/YV.pt')
# Release the feature and label objects once they are on disk.
del X
del Y
del XV
del YV


## Meta-path Similarities

In [None]:
from module1 import meta_path as MP
# ======================= Computing the Meta Path based Similarities ======================
# Pass the configured similarity_type (read from the environment, default 'PC')
# instead of a hard-coded 'PC', so a configured 'SPS' is actually used.
MP_inst = MP.Meta_path(HG, similarity_type = similarity_type, saving_path = saving_path)
# ==================================== SAVING =============================================
# Store the graph and the node list exposed by the meta-path object next to the
# other outputs in saving_path.
nx.write_gml(HG, f'{saving_path}/HG.gml')
save_list_as_pickle(MP_inst.Nodes,   saving_path, 'Nodes')


## Reduction

In [None]:
from module1 import reduction as Red
# Instantiate Red.Reduction on saving_path (the .../HGNN_data folder set earlier).
# What the constructor reads or writes is defined in module1.reduction and is not
# visible in this notebook.
reduction_obj = Red.Reduction(saving_path)