# Generating the data
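1. Generate the heterogeneous graph (patients, visits, medications, diagnoses, procedures).
2. Generate the feature set: each node's adjacency to the medication, diagnosis and procedure nodes.
3. Generate the labels: length of stay (LOS, in days) for every visit node.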

In [1]:
import sys, os, copy
import random
from copy import deepcopy
import numpy as np
from sklearn.model_selection import train_test_split
import pickle
import scipy.sparse as sp

# OAK
sys.path.append('../data_generation')

from  ModelFunctions import *
# ============================================================================
# Configuration: `num_Diseases` is passed to `load_patients_data` when the
# graph is built below; the graph produced in this run contains the same
# number of diagnosis nodes (203).
num_Diseases = 203
# ============================================================================
print(f"Number of Diseases: {num_Diseases}")
# ============================================================================

Number of Diseases: 203


In [2]:
def G_statistics(G):
    """Print a per-type node count and the edge count of graph ``G``.

    A node's type is taken from the first character of its identifier:
    'C' patients, 'V' visits, 'M' medications, 'D' diagnoses, 'P' procedures.
    Nodes with any other leading character are not reported.

    Parameters:
    G: graph object exposing ``nodes()`` and ``number_of_edges()``.

    Returns:
    None. The summary is written to stdout.
    """
    # Read every node's leading character once, before anything is printed.
    leading_chars = [node_id[0] for node_id in G.nodes()]

    # (label used in the report line, node-id prefix)
    report_rows = (
        ('patients', 'C'),
        ('visits', 'V'),
        ('Medication', 'M'),
        ('Diagnoses', 'D'),
        ('Procedures', 'P'),
    )
    for label, prefix in report_rows:
        print(f'number of {label} = {leading_chars.count(prefix)}')
    print(f'number of Edges = {G.number_of_edges()}')

    print('------------------------------------------\n')

def calculate_percentage_of_zeros(A):
    """
    Calculate the percentage of zero values in an array or sparse matrix.

    Parameters:
    A (numpy.ndarray, array-like, or scipy.sparse matrix/array): the data to
        inspect. Dense input is passed through ``np.asarray``; sparse input is
        handled without densifying it.

    Returns:
    float: Percentage of zero values (0-100). ``nan`` when ``A`` has no
        elements, because the percentage is undefined in that case.
    """
    if sp.issparse(A):
        # For sparse containers `.size` is the number of *stored* entries,
        # so the logical element count has to come from the shape instead.
        total_values = int(np.prod(A.shape))
        # count_nonzero() ignores explicitly stored zeros, so the complement
        # is the true number of zero-valued cells.
        num_zeros = total_values - int(A.count_nonzero())
    else:
        dense = np.asarray(A)
        total_values = dense.size
        num_zeros = int(np.count_nonzero(dense == 0))

    # Avoid a ZeroDivisionError on empty input.
    if total_values == 0:
        return float('nan')

    return (num_zeros / total_values) * 100

# Removing <patients to delete> from HG
def remove_patients_and_linked_visits(nodes, HG):
    """Return a deep copy of ``HG`` without ``nodes`` and their visit neighbours.

    For every node in ``nodes``, each neighbour whose identifier starts with
    'V' is scheduled for removal together with the node itself. ``HG`` is
    left untouched; the pruning happens on the copy.

    Parameters:
    nodes: iterable (with ``len``) of node identifiers to drop.
    HG: graph exposing ``neighbors`` and ``remove_nodes_from``.

    Returns:
    The pruned copy of ``HG``.
    """
    print('Number of PATIENTS to remove: ', len(nodes))

    pruned_graph = deepcopy(HG)

    doomed = []
    for patient in nodes:
        # Linked visits go first, then the patient itself.
        doomed.extend(nbr for nbr in HG.neighbors(patient) if nbr[0] == 'V')
        doomed.append(patient)

    print('Number of nodes to remove: ', len(doomed))
    pruned_graph.remove_nodes_from(doomed)
    return pruned_graph


## 1. Generate the Heterogeneous graph
### The entire graph...

In [3]:
# ============================================================================
# Build the full heterogeneous graph with the project helpers
# (presumably provided by the `ModelFunctions` star import -- verify).
print('Creating the graph...')

# Load the three patient tables, restricted by `num_Diseases`.
# NOTE(review): these look like pandas DataFrames -- confirm in ModelFunctions.
new_Diagnosis, new_Prescriptions, new_Procedures = load_patients_data(num_Diseases)

# ============================================================================

# One edge collection per relation. Judging by the names these are presumably
# patient-visit (CV), visit-diagnosis (VD), visit-procedure (VP) and
# visit-medication (VM) edges -- TODO confirm.
CV_edges , VD_edges , VP_edges , VM_edges = get_homogeneous_graphs(new_Diagnosis, new_Prescriptions, new_Procedures)

# ============================================================================

# Merge the four edge collections into a single graph object.
HG = get_Heterogeneous_graph(CV_edges , VD_edges , VP_edges , VM_edges)

# ============================================================================

# Node identifiers grouped by type; the sixth return value is not used here.
Patients, Visits, Medications, Diagnosis, Procedures, _ = get_Nodes(HG)
# Canonical node order: later cells build the LOS vector and the
# node -> index lookup by iterating over this list.
Nodes = Patients + Visits + Medications + Diagnosis  + Procedures

Creating the graph...
Use the patients inside the new DF....
For the given diagnosis, extract the sub dataframe....
Diagnoses frequency =  ['401', '427', '428', '276', '250', '414', '272', '518', '285', '584', 'V45', '530', 'V58', '599', 'E87', 'V10', '585', '403', '038', 'V30', '765', '998', '305', 'V29', '780', '997', 'V05', '424', '995', '785', '410', '244', '041', '458', '707', 'V15', '486', '996', '496', 'V12', '790', '287', 'E93', '348', '507', '493', '311', '511', '571', '412', '770', 'E88', '300', '733', '774', '278', '070', '416', '787', '578', '197', '198', '482', '327', 'V49', '274', 'V43', '572', '779', '280', 'E84', '440', '303', '425', '789', '286', '788', 'V50', '294', '600', '411', '560', '443', '288', '357', '577', '441', '682', '453', 'V44', '112', '799', '438', '345', '008', '293', '275', '426', '786', '562', '434', '715', 'E81', '784', '564', '263', '362', '433', '331', '852', '724', '296', '431', '491', '805', '576', '769', '569', '519', 'V42', '338', '807', '162',

  new_df.loc[:, id1] = c1 + '_' + new_df[id1].astype(str)
  new_df.loc[:, id2] = c2 + '_' + new_df[id2].astype(str)
  new_df.loc[:, id1] = c1 + '_' + new_df[id1].astype(str)
  new_df.loc[:, id1] = c1 + '_' + new_df[id1].astype(str)
  new_df.loc[:, id1] = c1 + '_' + new_df[id1].astype(str)


Done --> Getting the Homogeneous graphs...
Creating the Heterogeneous graph...
number of patients = 46437
number of visits = 58897
number of Medication = 592
number of Diagnoses = 203
number of Procedures = 89
Done --> Creating the Heterogeneous graph...
number of patients = 46437
number of visits = 58897
number of Medication = 592
number of Diagnoses = 203
number of Procedures = 89


In [4]:
def get_LOS(Nodes, G):
    """Return the length of stay (LOS, in days) for every node in ``Nodes``.

    Visit nodes (identifier starting with 'V') get the LOS of the admission
    whose HADM_ID equals ``int(node[2:])``; if that admission is not found,
    or the node is not a visit, the entry is 0.

    Parameters:
    Nodes: iterable of node identifiers. Visit identifiers are expected to
        carry the HADM_ID after a two-character prefix
        (presumably 'V_<HADM_ID>' -- verify against the graph builder).
    G: unused; kept so existing calls keep working.

    Returns:
    list: one LOS value per entry of ``Nodes``, in the same order.

    Notes:
    Relies on the module-level names ``pd`` and ``folder_path``
    (NOTE(review): presumably supplied by the ModelFunctions star import).
    """
    def get_HADM_ID_LOS():
        """Read ADMISSIONS.csv and return a frame with HADM_ID and LOS (days)."""
        df_admissions = pd.read_csv(f'{folder_path}/ADMISSIONS.csv')

        # Ensure ADMITTIME and DISCHTIME are in datetime format
        df_admissions['ADMITTIME'] = pd.to_datetime(df_admissions['ADMITTIME'])
        df_admissions['DISCHTIME'] = pd.to_datetime(df_admissions['DISCHTIME'])

        # Calculate the Length of Stay (LOS) in days
        df_admissions['LOS'] = (df_admissions['DISCHTIME'] - df_admissions['ADMITTIME']).dt.total_seconds() / (24 * 60 * 60)

        return df_admissions[['HADM_ID', 'LOS']]

    LOS_DF = get_HADM_ID_LOS()
    print(LOS_DF.head(5))

    # Build the HADM_ID -> LOS lookup once instead of scanning the whole
    # frame with a boolean mask for every visit node. keep='first' preserves
    # the previous "first matching row wins" behaviour.
    los_by_hadm_id = (
        LOS_DF.drop_duplicates(subset='HADM_ID', keep='first')
        .set_index('HADM_ID')['LOS']
        .to_dict()
    )

    LOS = []
    for node in Nodes:
        if node[0] == 'V':
            # 0 when no admission matches this HADM_ID
            LOS.append(los_by_hadm_id.get(int(node[2:]), 0))
        else:
            # Non-visit nodes have no length of stay.
            LOS.append(0)

    return LOS

# Label vector: one LOS entry per node, in the same order as `Nodes`.
LOS = np.asarray(get_LOS(Nodes, HG))


   HADM_ID       LOS
0   165315  1.144444
1   152223  5.496528
2   124321  6.768056
3   161859  2.856944
4   129635  3.534028


In [5]:
# Position of every node in `Nodes`; LOS was built in that same order,
# so these positions index straight into LOS.
nodes_indeces = dict(zip(Nodes, range(len(Nodes))))

# Sanity check: show the LOS values of the visit nodes only.
LOS[[nodes_indeces[node] for node in Nodes if node[0] == 'V']]

array([ 6.20763889,  4.08055556, 12.06180556, ..., 16.78819444,
        5.91597222,  5.52291667])

## Removing Isolated nodes

In [6]:
# Nodes without any edge are dropped before features are built.
isolated_nodes = [v for v in HG.nodes() if HG.degree(v) == 0]

# Statistics before the removal.
G_statistics(HG)

# remove_patients_and_linked_visits already returns an independent deep copy,
# so rebinding HG is enough; a second deepcopy of the whole graph would only
# double the time and peak memory of this cell.
HG = remove_patients_and_linked_visits(isolated_nodes, HG)

# Statistics after the removal.
G_statistics(HG)


number of patients = 46437
number of visits = 58897
number of Medication = 592
number of Diagnoses = 203
number of Procedures = 89
number of Edges = 769929
------------------------------------------

Number of PATIENTS to remove:  0
Number of nodes to remove:  0
number of patients = 46437
number of visits = 58897
number of Medication = 592
number of Diagnoses = 203
number of Procedures = 89
number of Edges = 769929
------------------------------------------



## Identifying the feature set

In [8]:
# Node lists of the cleaned graph; Nodes2 is the list saved next to X.
Patients, Visits, Medications, Diagnosis, Procedures, _ = get_Nodes(HG)
Nodes2 = Patients + Visits + Medications + Diagnosis + Procedures

# Step 1: Create lists of node sets (type = first character of the node id)
patients    = [p for p in HG.nodes() if p[0] == 'C']  # Patient nodes
visits      = [v for v in HG.nodes() if v[0] == 'V']  # Visit nodes
medications = [m for m in HG.nodes() if m[0] == 'M']  # Medication nodes
diagnoses   = [d for d in HG.nodes() if d[0] == 'D']  # Diagnosis nodes
procedures  = [p for p in HG.nodes() if p[0] == 'P']  # Procedure nodes

# Rows: every node. Columns: the clinical-code nodes only.
row_nodes = patients + visits + medications + diagnoses + procedures
col_nodes = medications + diagnoses + procedures
# NOTE(review): X rows follow `row_nodes`, while Y follows `Nodes` and the
# saved node list is `Nodes2`. Confirm get_Nodes returns each group in
# HG.nodes() order so that all three line up.

# Step 2: Build the adjacency matrix with an explicit node order and index
# against that same order. Without `nodelist` the matrix follows HG.nodes(),
# which is not guaranteed to match the grouped order of `Nodes`; looking
# positions up in `Nodes` could therefore select the wrong rows and columns.
graph_order = list(HG.nodes())
adj_matrix = nx.adjacency_matrix(HG, nodelist=graph_order)
node_idx = {node: idx for idx, node in enumerate(graph_order)}
row_indices = [node_idx[node] for node in row_nodes]  # Indices for rows
col_indices = [node_idx[node] for node in col_nodes]  # Indices for columns

# Step 3: Extract the feature matrix: rows for Patients, Visits, Medications,
# Diagnoses, Procedures; columns for Medications, Diagnoses, Procedures.
X = adj_matrix[row_indices, :][:, col_indices]

assert X.shape == (len(row_nodes), len(col_nodes)), "unexpected feature-matrix shape"


number of patients = 46437
number of visits = 58897
number of Medication = 592
number of Diagnoses = 203
number of Procedures = 89


In [9]:
# Features and labels should have the same number of rows.
print(X.shape, LOS.shape)

# Output directory for the generated graph, features and labels.
saving_path = '../Data/HG'
print(saving_path)

(106218, 884) (106218,)
../Data/HG


## SAVING...

In [10]:
def save_list_as_pickle(L, given_path):
    """Pickle ``L`` to the file at ``given_path``, overwriting existing content.

    Parameters:
    L: the object to serialise (a list of node identifiers in this notebook).
    given_path: destination file path; opened in binary write mode.

    Returns:
    None. The destination path is echoed to stdout.
    """
    print(f'saving to {given_path}')
    with open(given_path, 'wb') as sink:
        pickle.Pickler(sink).dump(L)

# Create the output directory if needed, so a fresh checkout does not fail
# on the first write.
os.makedirs(saving_path, exist_ok=True)

# Features, labels and the saved node list must describe the same nodes.
# NOTE(review): LOS was built from `Nodes` (before isolated-node removal) and
# the saved list is `Nodes2`; this only checks the lengths, not the order.
assert X.shape[0] == LOS.shape[0] == len(Nodes2), (
    f'misaligned outputs: X has {X.shape[0]} rows, '
    f'Y has {LOS.shape[0]} entries, node list has {len(Nodes2)} nodes'
)

torch.save(X, f'{saving_path}/X.pt')      # feature matrix
torch.save(LOS, f'{saving_path}/Y.pt')    # LOS labels
nx.write_gml(HG, f'{saving_path}/HG.gml')
save_list_as_pickle(Nodes2, f'{saving_path}/Nodes.pkl')

saving to ../Data/HG/Nodes.pkl


In [12]:
# Display the sparse feature matrix summary (shape, dtype, stored elements).
X

<106218x884 sparse array of type '<class 'numpy.int64'>'
	with 711125 stored elements in Compressed Sparse Row format>