# Generating the data
1. Generate the Heterogeneous graph
2. Generate the feature set and labels.
3. Generate the k-metapath-based similarity matrices
4. Reduce the similarity matrices (the As) and convert them to edge lists.

In [None]:
import os
import torch
import networkx as nx
import random


def save_list_as_pickle(L, given_path, file_name):
    """Pickle ``L`` into ``<given_path>/<file_name>.pkl``.

    The destination is printed before writing. ``given_path`` must already
    exist: the file is opened directly and no directory is created here.
    """
    import pickle

    target = f'{given_path}/{file_name}.pkl'
    print(f'saving to {target}')
    with open(target, 'wb') as out:
        pickle.dump(L, out)


In [None]:
class heterogeneous_Graph:
    """Wrap a graph and split its nodes into per-type lists.

    A node's type is read from the first character of its identifier:
    ``C`` patients, ``V`` visits, ``M`` medications, ``D`` diagnoses,
    ``P`` procedures, ``L`` labs and ``B`` microbiology. ``Nodes`` is the
    concatenation of those lists in that order, so nodes with any other
    leading character are not part of it.
    """

    def __init__(self, G):
        self.HG = G
        all_nodes = list(G.nodes())

        def with_prefix(letter):
            # Keeps the graph's own iteration order within each type.
            return [node for node in all_nodes if node[0] == letter]

        self.Patients = with_prefix('C')
        self.Visits = with_prefix('V')
        self.Medications = with_prefix('M')
        self.Diagnoses = with_prefix('D')
        self.Procedures = with_prefix('P')
        self.Labs = with_prefix('L')
        self.MicroBio = with_prefix('B')
        self.Nodes = [
            *self.Patients,
            *self.Visits,
            *self.Medications,
            *self.Diagnoses,
            *self.Procedures,
            *self.Labs,
            *self.MicroBio,
        ]
        


In [None]:

# Run configuration. Every setting is read from an environment variable and
# falls back to the default shown here when the variable is unset.

# # OAK :-)
# MIMIC_Path        = os.getenv('MIMIC_Path',        '/home/almusawiaf/MyDocuments/PhD_Projects/Data/MIMIC_resources')

# Check for Athena :-(
MIMIC_Path        = os.getenv('MIMIC_Path', '../../MIMIC_resources')

disease_data_path = os.getenv('disease_data_path', '../Data')

num_Diseases    = int(os.getenv('NUM_DISEASES', 203))
DISEASE_FILE    = os.getenv('DISEASE_FILE', 'DMPLB2')
similarity_type = os.getenv('similarity_type', 'PC')  # options are PC: PathCount, SPS: Symmetric PathSim

num_Sample      = int(os.getenv('num_Sample', 250))
r_u_sampling    = os.getenv('r_u_sampling', 'True')


def _is_true(text):
    """Return True when ``text`` spells "true", ignoring case and padding.

    Environment values are free-form text; an exact comparison against
    'True' would silently turn 'true' or 'TRUE' into False.
    """
    return text.strip().lower() == 'true'


# Boolean switches derived from the textual environment values.
sampling = _is_true(r_u_sampling)
PSGs_ing = _is_true(os.getenv('PSGs_ing', 'False'))

print(num_Diseases, DISEASE_FILE, similarity_type, num_Sample, sampling)


In [None]:
# =================================================================================
# Every output of this run is written below
# <disease_data_path>/<num_Diseases>_Diagnoses/<DISEASE_FILE>/<num_Sample>.
base_path = f'{disease_data_path}/{num_Diseases}_Diagnoses/{DISEASE_FILE}/{num_Sample}'
saving_path = f'{base_path}/HGNN_data'

# Top-level output folders first, then the two folders nested in HGNN_data.
output_dirs = [
    f'{base_path}/{sub}'
    for sub in ('HGNN_data', 'clinical_items', 'GMLs', 'OHV', 'PSGs', 'SNFs')
]
output_dirs += [f'{saving_path}/As', f'{saving_path}/edges']
for directory in output_dirs:
    os.makedirs(directory, exist_ok=True)
# =================================================================================
print(saving_path)


### Generating complete HG from scratch
- `HG_inst` is an object that holds the heterogeneous graph information: the graph itself (HG), the full node list (Nodes), and separate lists for the different node types (Patients, Visits, Diagnoses, etc.).

In [None]:
from module1 import generating_HG as gHG

# Build the complete heterogeneous graph from the resources at MIMIC_Path.
HG_inst = gHG.Generate_HG(MIMIC_Path)

# Store the complete graph one level above the per-configuration folders.
complete_HG_file = f'{disease_data_path}/{num_Diseases}_Diagnoses/complete_HG.gml'
nx.write_gml(HG_inst.HG, complete_HG_file)

# NOTE(review): presumably reports summary statistics of the graph - confirm in module1.
gHG.G_statistics(HG_inst.HG)


## Whole graph / Sample graph / Selected Patients only?

- Do we want to work on the entire graph or on a sample of it?
- We can build our own graph accordingly, for example by:
    - Identifying the specific set of patients to keep.
    - Identifying the remaining patients to remove.
    - Calling the **remove_patients_and_linked_visits** function to remove the **passed patients** from the **passed graph**.
    - This function returns a new graph containing the needed patients and their connected medical information.

In [None]:
# ======================To sample or not to sample, that is the question =========================
# Choose the working graph: either the complete graph, or a copy reduced to
# num_Sample randomly chosen patients.
total_patients = len(HG_inst.Patients)

if not sampling:
    num_Sample = total_patients
    final_HG = HG_inst.HG
else:
    if num_Sample > total_patients:
        # random.sample raises ValueError for a negative sample size, which is
        # what a request for more patients than exist would produce.
        print(f'num_Sample ({num_Sample}) exceeds the number of patients ({total_patients}); keeping all patients.')
        num_Sample = total_patients

    # The patients to drop are drawn at random; no seed is set here, so the
    # selection differs between runs.
    patients_to_remove = random.sample(HG_inst.Patients, total_patients - num_Sample)
    print(len(patients_to_remove), num_Sample, total_patients)

    # deleting the nodes
    final_HG = gHG.remove_patients_and_linked_visits(patients_to_remove, HG_inst.HG)
# =================================================================================

# We create new HG instance for the new (partial) network
HG_obj = heterogeneous_Graph(final_HG)
gHG.G_statistics(HG_obj.HG)

## Extracting the features

In [None]:
from module1 import XY_preparation as XY

# Build the feature/label object once; it exposes both the patient-based
# (X, Y) and the visit-based (X_visit, Y_visit) attributes, so a second
# construction over the same graph is not needed.
XY_inst = XY.XY_preparation(final_HG)
# ============================ Extracting Patient-based X and Y =================================
X = XY_inst.X
Y = XY_inst.Y
# ============================ Extracting Visit-based X and Y =================================
XV = XY_inst.X_visit
YV = XY_inst.Y_visit
# ==================================== Saving X and Y  (patient-based) ============================
torch.save(X, f'{base_path}/OHV/X.pt')
torch.save(Y, f'{base_path}/OHV/Y.pt')
# ==================================== Saving X and Y (visit-based) =================================
# The visit-based files must hold XV/YV, not the patient-based X/Y.
torch.save(XV, f'{base_path}/OHV/XV.pt')
torch.save(YV, f'{base_path}/OHV/YV.pt')

# Drop the notebook-level names; XY_inst still references the same objects.
del X
del Y
del XV
del YV


## Meta-path Similarities

In [None]:
from module1 import meta_path_2 as MP
# ======================= Computing the Meta Path based Similarities ======================
# Use the configured similarity_type (default 'PC') rather than a hard-coded
# 'PC', so the similarity_type environment setting takes effect.
MP_inst = MP.Meta_path(final_HG, similarity_type=similarity_type, saving_path=saving_path)
# ==================================== SAVING =============================================
# Store the working graph and the node list exposed by MP_inst under GMLs/.
nx.write_gml(final_HG, f'{base_path}/GMLs/HG.gml')
save_list_as_pickle(MP_inst.Nodes,   f'{base_path}/GMLs', 'Nodes')


## External Features

In [None]:
# Saving the data to PSGs/
from module1 import patients_sim as PS

# Wrap the working graph to get its node list grouped by node type.
HG_obj2 = heterogeneous_Graph(final_HG)

# HG_obj2.HG is the same object as final_HG, so the graph is passed directly.
PS.Patients_Similarity(final_HG, HG_obj2.Nodes, base_path)

## Reduction

In [None]:
from module1 import reduction as Red
# Run the reduction step on the outputs stored under base_path.
# PSGs_ing is the boolean derived from the PSGs_ing environment variable.
# NOTE(review): presumably it toggles handling of the PSGs/ outputs - confirm in module1.reduction.
Red.Reduction(base_path, PSGs=PSGs_ing)