In [1]:
import os
import torch
import networkx as nx
import random
import pickle

# Function to save a list as a pickle file
def save_list_as_pickle(L, given_path, file_name):
    """Pickle ``L`` into ``<given_path>/<file_name>.pkl``.

    The destination path is printed before the file is written. The
    directory ``given_path`` must already exist; it is not created here.

    Args:
        L: Object to serialize (the callers in this notebook pass a list).
        given_path: Directory that will hold the pickle file.
        file_name: File name without the ``.pkl`` extension.
    """
    target = f'{given_path}/{file_name}.pkl'
    print(f'saving to {target}')
    with open(target, 'wb') as out:
        pickle.dump(L, out)

def load_dict_from_pickle(filename):
    """Return the object stored in the pickle file ``filename``.

    Despite the name, whatever object was pickled is returned unchanged.

    NOTE: ``pickle.load`` can execute arbitrary code while deserializing,
    so only call this on files produced by a trusted source.
    """
    with open(filename, 'rb') as src:
        return pickle.load(src)

# Class representing a heterogeneous graph
class heterogeneous_Graph:
    """Wrap a graph and group its nodes by the first character of each node.

    Attributes set by the constructor:
        HG: the graph object passed in, stored as-is.
        Patients / Visits / Medications / Diagnosis / Procedures / Labs /
        MicroBio: nodes whose first element equals 'C' / 'V' / 'M' / 'D' /
        'P' / 'L' / 'B' respectively, in the order ``G.nodes()`` yields them.
        Nodes: the seven groups concatenated in the order listed above.
            Nodes with any other first character are not included.
    """

    def __init__(self, G):
        self.HG = G
        all_nodes = list(G.nodes())

        def with_prefix(letter):
            # Select the nodes of one type, preserving graph iteration order.
            return [node for node in all_nodes if node[0] == letter]

        self.Patients = with_prefix('C')
        self.Visits = with_prefix('V')
        self.Medications = with_prefix('M')
        self.Diagnosis = with_prefix('D')
        self.Procedures = with_prefix('P')
        self.Labs = with_prefix('L')
        self.MicroBio = with_prefix('B')

        groups = (
            self.Patients,
            self.Visits,
            self.Medications,
            self.Diagnosis,
            self.Procedures,
            self.Labs,
            self.MicroBio,
        )
        # Flatten into a fresh list so Nodes does not alias any single group.
        self.Nodes = [node for group in groups for node in group]

# Example code snippet using environment variables
def parse_bool_flag(text):
    """Interpret a textual on/off setting as a boolean.

    Args:
        text: Raw setting value, e.g. the string read from an environment
            variable.

    Returns:
        True when ``text`` (whitespace-trimmed, case-insensitive) is one of
        'true', '1', 'yes', 'y' or 'on'; False for any other value.
    """
    return str(text).strip().lower() in {'true', '1', 'yes', 'y', 'on'}


# Run configuration: every value can be overridden through the environment
# variable named in the os.getenv call; the second argument is the default.
MIMIC_Path        = os.getenv('MIMIC_Path', '../../MIMIC_resources')
disease_data_path = os.getenv('disease_data_path', '../Data')
num_Diseases      = int(os.getenv('NUM_DISEASES', 203))
DISEASE_FILE      = os.getenv('DISEASE_FILE', 'DMPLB2')
similarity_type   = os.getenv('similarity_type', 'PC')  # options: PC, SPS
num_Sample        = int(os.getenv('num_Sample', 500))
r_u_sampling      = os.getenv('r_u_sampling', 'True')   # raw text, kept for reference

# Convert string flags to boolean.
# Previously only the exact string 'True' enabled a flag, so values such as
# 'true' or '1' silently disabled sampling and processed the whole graph.
sampling = parse_bool_flag(r_u_sampling)
PSGs_ing = parse_bool_flag(os.getenv('PSGs_ing', 'True'))

# Print the settings
print(f'number of diseases (labels) = {num_Diseases}\nDisease File = {DISEASE_FILE}\nsimilarity_type = {similarity_type}\nnum_Sample = {num_Sample}\nsampling = {sampling}')

# Define the base path for saving data
base_path = f'{disease_data_path}/{num_Diseases}_Diagnoses/{DISEASE_FILE}/{num_Sample}'

# Create necessary directories (no error if they already exist)
for p in ['HGNN_data', 'clinical_items', 'GMLs', 'OHV', 'PSGs']:
    os.makedirs(f'{base_path}/{p}', exist_ok=True)

saving_path = f'{base_path}/HGNN_data'
os.makedirs(f'{saving_path}/As', exist_ok=True)
os.makedirs(f'{saving_path}/edges', exist_ok=True)

# Print the saving path
print(saving_path)


from module1 import generating_HG as gHG

# Generating the heterogeneous graph (HG) from the MIMIC resources directory
HG_inst = gHG.Generate_HG(MIMIC_Path)

# Save the complete (unsampled) HG as a GML file
nx.write_gml(HG_inst.HG, f'{disease_data_path}/{num_Diseases}_Diagnoses/complete_HG.gml')

# Print the graph's statistics
gHG.G_statistics(HG_inst.HG)
# ===========================================================================================
# Sampling or working with the whole graph
if not sampling:
    # NOTE(review): base_path was built from the configured num_Sample before
    # this reassignment, so the output directory name is not updated here.
    num_Sample = len(HG_inst.Patients)
    final_HG = HG_inst.HG
else:
    # Optional reproducibility: set RANDOM_SEED to seed the sampler.
    # When it is unset the sampler is left unseeded, as before.
    # NOTE(review): a repeatable sample also needs HG_inst.Patients to come
    # back in the same order on every run - confirm in generating_HG.
    seed_text = os.getenv('RANDOM_SEED')
    if seed_text is not None:
        random.seed(int(seed_text))

    # random.sample raises ValueError for a negative count, which happened
    # whenever num_Sample exceeded the number of patients. Clamp to zero so
    # that case keeps every patient instead of crashing.
    num_to_remove = max(0, len(HG_inst.Patients) - num_Sample)
    if num_to_remove == 0 and num_Sample > len(HG_inst.Patients):
        print(f'num_Sample ({num_Sample}) exceeds the number of patients ({len(HG_inst.Patients)}); keeping all patients')

    # Randomly sample patients to remove
    patients_to_remove = random.sample(HG_inst.Patients, num_to_remove)
    print(len(patients_to_remove), num_Sample, len(HG_inst.Patients))

    # Remove patients and linked visits
    # NOTE(review): whether this modifies HG_inst.HG in place or returns a
    # new graph is not visible from this notebook - confirm in generating_HG.
    final_HG = gHG.remove_patients_and_linked_visits(patients_to_remove, HG_inst.HG)

# Create a new HG instance for the sampled graph
HG_obj = heterogeneous_Graph(final_HG)

# Print statistics for the new graph
gHG.G_statistics(HG_obj.HG)

# ===========================================================================================
from module1 import XY_preparation as XY

# Build the patient-based X and Y from the current graph, and continue the
# pipeline with the graph object held by the preparation instance.
XY_inst = XY.XY_preparation(final_HG)
X, Y, final_HG = XY_inst.X, XY_inst.Y, XY_inst.HG

# Persist the feature set X and the labels Y
torch.save(X, f'{base_path}/OHV/X.pt')
torch.save(Y, f'{base_path}/OHV/Y.pt')

# Drop the notebook-level names (XY_inst still holds its own references)
del X, Y

# Extract the superclasses, save them, then drop the temporary name
YY = XY_inst.get_Y_superclasses()
torch.save(YY, f'{base_path}/OHV/Ysuperclass.pt')
del YY

# ===========================================================================================
from module1 import meta_path_2 as MP

# Compute the meta-path-based similarities.
# The similarity measure now comes from the `similarity_type` setting
# (environment variable `similarity_type`, default 'PC'). It was hard-coded
# to 'PC' here, so the configured and printed value was silently ignored.
MP_inst = MP.Meta_path(final_HG, similarity_type=similarity_type, saving_path=saving_path)

# Save the heterogeneous graph in GML format
nx.write_gml(final_HG, f'{base_path}/GMLs/HG.gml')

# Save the node list exposed by the meta-path instance next to the GML file
save_list_as_pickle(MP_inst.Nodes, f'{base_path}/GMLs', 'Nodes')

# ===========================================================================================
from module1 import patients_sim as PS   

# Create a new heterogeneous graph instance
# (final_HG was rebound to XY_inst.HG after HG_obj was built, so the node
# groups are recomputed from the current graph object)
HG_obj2 = heterogeneous_Graph(final_HG)

# Compute and save patient similarity
# NOTE(review): output location is presumably somewhere under base_path - confirm in patients_sim
PS.Patients_Similarity(HG_obj2.HG, HG_obj2.Nodes, base_path)
# ===========================================================================================
from module1 import reduction as Red

# Reduce and save the meta-path similarity matrices
# PSGs_ing is the boolean flag derived from the PSGs_ing environment variable.
# This is the last expression of the cell, so the returned Reduction object's
# repr is shown as the cell output.
Red.Reduction(base_path, PSGs=PSGs_ing)

# ===========================================================================================


number of diseases (labels) = 203
Disease File = DMPLB2
similarity_type = PC
num_Sample = 500
sampling = True
../Data/203_Diagnoses/DMPLB2/500/HGNN_data
Loading the dataframes...
Splitting lab tests


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  lab_df.loc[:, 'ITEMID_FLAG'] = lab_df['ITEMID'].astype(str) + '_' + lab_df['FLAG'].astype(str)


Number of visits here is 58151
Use the patients inside the new DataFrame....
Dropping NaN visits
General Information:
---------------------------
Number of Patients = 46517
Number of Visits = 58929
Number of Diagnosis = 203
Number of procedures = 89
Number of Medication = 592
Number of Lab tests  = 993
Number of MicroBio   = 65
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Extracting bipartite networks...

Extracting and adding data of Visits

Extracting and adding data of Diagnosis

Extracting and adding data of Procedures

Extracting and adding data of Medications

Extracting and adding data of Lab tests

Extracting and adding data of MicroBiology tests
number of patients = 46437
number of visits = 58929
number of Medication = 592
number of Diagnoses = 203
number of Procedures = 89
number of Labs = 993
number of MicoBio = 64
number of Edges = 5360286
------------------------------------------

Removing isolated nodes
Number of PATIENTS to remove: 0
Number of nodes to re

<module1.reduction.Reduction at 0x7f3958e37ad0>

In [2]:
def load_dict_from_pickle(filename):
    """Return the object stored in the pickle file ``filename``.

    This redefines (and shadows) the identically named helper from the first
    cell. It relies on ``pickle`` having been imported there.

    NOTE: ``pickle.load`` can execute arbitrary code while deserializing,
    so only call this on files produced by a trusted source.
    """
    with open(filename, 'rb') as src:
        return pickle.load(src)
# Compare the meta-path names stored under HGNN_data/edges with those stored
# under HGNN_data/As. The paths are derived from `saving_path` so this cell
# follows the active configuration instead of a hard-coded copy of the
# default directory (with the defaults the resulting paths are unchanged).
b = load_dict_from_pickle(f'{saving_path}/edges/final_meta_paths.pkl')
a = load_dict_from_pickle(f'{saving_path}/As/metapath_list.pkl')

# Entries present in exactly one of the two collections. Sorted so the
# printed result is stable across runs (set iteration order is arbitrary).
symmetric_diff = sorted(set(a).symmetric_difference(set(b)))
print("Symmetric Difference:", symmetric_diff)


Symmetric Difference: ['pvm', 'lvb', 'dvm', 'cvmvc', 'P', 'mvl', 'M', 'D', 'cvm', 'mvb', 'vmv', 'B', 'vm']


In [3]:
a

['cv',
 'vm',
 'vd',
 'vp',
 'vl',
 'vb',
 'cvm',
 'cvd',
 'cvp',
 'cvl',
 'cvb',
 'dvm',
 'dvp',
 'dvl',
 'dvb',
 'pvm',
 'pvl',
 'pvb',
 'mvl',
 'mvb',
 'lvb',
 'cvmvc',
 'cvdvc',
 'cvpvc',
 'cvlvc',
 'cvbvc',
 'vmv',
 'vdv',
 'vpv',
 'vlv',
 'vbv']

In [4]:
b

['cv',
 'vd',
 'vp',
 'vl',
 'vb',
 'cvd',
 'cvp',
 'cvl',
 'cvb',
 'dvp',
 'dvl',
 'dvb',
 'pvl',
 'pvb',
 'cvdvc',
 'cvpvc',
 'cvlvc',
 'cvbvc',
 'vdv',
 'vpv',
 'vlv',
 'vbv',
 'M',
 'D',
 'P',
 'B']