Idea: generate the heterogeneous graph for the entire MIMIC dataset


In [16]:
import numpy as np
import pandas as pd
import networkx as nx 
import pickle
import random

saving_path = 'Results/'

In [17]:
def extract3(code):
    return str(code)[:3]
def extract2(code):
    return str(code)[:2]

1. read the patients information:
    we are doing this by selecting those set of patients with admissions.

In [38]:
def load_data(selected_diagnosis, fraction):
    folder_path = f'../Data/MIMIC_resources'
    
    print('For the given diagnosis, extract the sub dataframe....')
    df_DiagnosisICD = pd.read_csv(f'{folder_path}/DIAGNOSES_ICD.csv')    # Diagnosis!
    df_DiagnosisICD.dropna(subset=['ICD9_CODE'], inplace=True)
    df_DiagnosisICD['ICD9_CODE'] = df_DiagnosisICD['ICD9_CODE'].apply(extract3)
    df_DiagnosisICD  = df_DiagnosisICD[df_DiagnosisICD['ICD9_CODE'].isin(selected_diagnosis)]
    
    print(df_DiagnosisICD.head(10))

    print('Use the patients inside the new DF....')
    all_patients = list(df_DiagnosisICD['SUBJECT_ID'].unique())
    new_Diagnosis = df_DiagnosisICD[df_DiagnosisICD['SUBJECT_ID'].isin(all_patients)]


    print('Reading data from other tables....')
    df_Admissions = pd.read_csv(f'{folder_path}/ADMISSIONS.csv')        # Admissions
    df_ProceduresICD = pd.read_csv(f'{folder_path}/PROCEDURES_ICD.csv')    # Procedures!
    df_Medications = pd.read_csv(f'{folder_path}/PRESCRIPTIONS.csv')

    # df_Patients = pd.read_csv(f'{folder_path}/PATIENTS.csv')            # Patients
    # df_Icustays = pd.read_csv(f'{folder_path}/ICUSTAYS.csv')                # ICUStays

    # Droping rows with empty core cell.
    df_ProceduresICD.dropna(subset=['ICD9_CODE'], inplace=True)
    df_Medications.dropna(subset=['drug'], inplace=True)

    # Calculate the number of elements to sample
    sample_size = int(len(all_patients) * fraction)

    # Sample the patients
    random.seed(42) 
    patients = random.sample(all_patients, sample_size)
    new_Diagnosis = new_Diagnosis[new_Diagnosis['SUBJECT_ID'].isin(patients)]


    print('Selecting the data for the existed admission.')
    new_Admissions = df_Admissions[df_Admissions['SUBJECT_ID'].isin(patients)]
    admissions = new_Admissions['HADM_ID'].unique()

    new_Procedures = df_ProceduresICD[df_ProceduresICD['HADM_ID'].isin(admissions)]
    new_Prescriptions = df_Medications[df_Medications['hadm_id'].isin(admissions)]

    # extracting the needed information
    new_Procedures['ICD9_CODE'] = new_Procedures['ICD9_CODE'].apply(extract2)
    # ----------------------------------------------------------------------------

    # extracting the unique sets of nodes of diff category.
    Procedures = sorted(new_Procedures['ICD9_CODE'].unique())
    Medication = sorted(new_Prescriptions['drug'].unique())
    Diagnosis  = new_Diagnosis['ICD9_CODE'].unique()

    print('General Information:\n---------------------------')
    print(f'Number of Patients = {len(patients)}')
    print(f'Number of Admissions = {len(admissions)}')
    print(f'Number of Diagnosis = {len(Diagnosis)}')
    print(f'Number of procedures = {len(Procedures)}')
    print(f'Number of Medication = {len(Medication)}')

    return new_Admissions, new_Diagnosis, new_Prescriptions, new_Procedures

In [39]:
def getDict2(df, id1, id2, c1, c2):
    new_df = df[[id1, id2]].copy()    
    
    # Add the prefixes to each column
    new_df.loc[:, id1] = c1 + '_' + new_df[id1].astype(str)
    new_df.loc[:, id2] = c2 + '_' + new_df[id2].astype(str)
    
    # Remove duplicate rows
    new_df = new_df.drop_duplicates()
    return new_df


def getEdges(data, id1, id2):
    # Check if data is a DataFrame and extract edges accordingly
    if isinstance(data, pd.DataFrame):
        # Extract edges from the DataFrame
        EdgesList = list(data[[id1, id2]].itertuples(index=False, name=None))
    else:
        # Assuming data is a list of dictionaries
        EdgesList = [(d[id1], d[id2]) for d in data]
    
    return EdgesList


def get_homogeneous_graphs(new_Admissions, new_Diagnosis, new_Prescriptions, new_Procedures):
    print('---------------------------------')
    print('Getting the Homogeneous graphs...')

    patient_visit_df    = getDict2(new_Diagnosis,  'SUBJECT_ID', 'HADM_ID', 'C', 'V')
    visit_diagnosis_df  = getDict2(new_Diagnosis, 'HADM_ID', 'ICD9_CODE',  'V', 'D')
    visit_procedure_df  = getDict2(new_Procedures, 'HADM_ID', 'ICD9_CODE', 'V', 'P')
    visit_medication_df = getDict2(new_Prescriptions, 'hadm_id', 'drug', 'V', 'M')

    # Edge Extractions
    CV_edges = getEdges(patient_visit_df, 'SUBJECT_ID', 'HADM_ID')
    VD_edges = getEdges(visit_diagnosis_df, 'HADM_ID', 'ICD9_CODE')
    VP_edges = getEdges(visit_procedure_df,'HADM_ID', 'ICD9_CODE')
    VM_edges = getEdges(visit_medication_df,  'hadm_id', 'drug')  

    print('Done --> Getting the Homogeneous graphs...')
    
    return   CV_edges , VD_edges , VP_edges , VM_edges



def get_Heterogeneous_graph(CV_edges , VD_edges , VP_edges , VM_edges):
    print('Creating the Heterogeneous graph...')

    edge_set = CV_edges + VD_edges + VP_edges + VM_edges #+ VI_edges

    tempG = nx.Graph()
    tempG.add_edges_from(edge_set)

    Nodes = list(tempG.nodes())
    N = len(Nodes)

    C_Nodes = [v for v in Nodes if v[0]=='C']
    V_Nodes = [v for v in Nodes if v[0]=='V']
    M_Nodes = [v for v in Nodes if v[0]=='M']
    D_Nodes = [v for v in Nodes if v[0]=='D']
    P_Nodes = [v for v in Nodes if v[0]=='P']

    print(f'number of patients = {len(C_Nodes)}')
    print(f'number of visits = {len(V_Nodes)}')
    print(f'number of Medication = {len(M_Nodes)}')
    print(f'number of Diagnoses = {len(D_Nodes)}')
    print(f'number of Procedures = {len(P_Nodes)}')

    The_Graph = nx.Graph()
    The_Graph.add_nodes_from(C_Nodes + V_Nodes + M_Nodes + D_Nodes + P_Nodes)
    The_Graph.add_edges_from(edge_set)

    print('Done --> Creating the Heterogeneous graph...')
    return The_Graph
    

In [43]:
CHOSEN_DIAGNOSES = ['D_276', 'D_584', 'D_428', 'D_427', 'D_518', 'D_285', 'D_401', 'D_599']

new_Admissions, new_Diagnosis, new_Prescriptions, new_Procedures = load_data([d[2:] for d in CHOSEN_DIAGNOSES], 0.3)

CV_edges , VD_edges , VP_edges , VM_edges = get_homogeneous_graphs(new_Admissions, new_Diagnosis, new_Prescriptions, new_Procedures)

HG = get_Heterogeneous_graph(CV_edges , VD_edges , VP_edges , VM_edges)

# Save the graph to a GML file
nx.write_gml(HG, "results/the_complete_hetero_graph.gml")


For the given diagnosis, extract the sub dataframe....
    ROW_ID  SUBJECT_ID  HADM_ID  SEQ_NUM ICD9_CODE
5     1302         109   172335      6.0       276
7     1304         109   172335      8.0       276
11    1308         109   172335     12.0       285
12    1309         109   172335     13.0       285
20    1317         109   173633      7.0       285
30    1490         112   174105      3.0       285
39    1499         114   178393      4.0       285
46    1506         115   114585      5.0       584
50    1510         115   114585      9.0       599
51    1511         115   114585     10.0       428
Use the patients inside the new DF....
Reading data from other tables....
Selecting the data for the existed admission.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_Procedures['ICD9_CODE'] = new_Procedures['ICD9_CODE'].apply(extract2)


General Information:
---------------------------
Number of Patients = 10128
Number of Admissions = 13712
Number of Diagnosis = 8
Number of procedures = 88
Number of Medication = 364
---------------------------------
Getting the Homogeneous graphs...
Done --> Getting the Homogeneous graphs...
Creating the Heterogeneous graph...
number of patients = 10128
number of visits = 13620
number of Medication = 364
number of Diagnoses = 8
number of Procedures = 88
Done --> Creating the Heterogeneous graph...
