## 1. Data Preprocessing

In [1]:
import numpy as np
import pandas as pd

### update  procedure index

In [2]:
# Read the procedure codes (without the ROW_ID column), the mortality labels
# (renaming subject_ID so it matches the SUBJECT_ID merge key) and the saved
# subject index, which is wrapped in a one-column frame for merging.
data = pd.read_csv('mimic_PROCEDURES_ICD.csv').drop(columns=['ROW_ID'])
label = pd.read_csv('000_mortality label.csv').rename(columns={"subject_ID": "SUBJECT_ID"})
index = pd.DataFrame(np.load('index.npy'), columns=['SUBJECT_ID'])       #    2314

In [7]:
# Left-join so every subject in the index is kept, with or without procedure rows.
temp1 = index.merge(data, how='left', on='SUBJECT_ID')

In [8]:
# Subjects whose left-join found at least one procedure row (HADM_ID is not null),
# de-duplicated and sorted ascending.
index_new = temp1.loc[temp1['HADM_ID'].notna(), 'SUBJECT_ID']
index_new = sorted(set(index_new.tolist()))
index_new_df = pd.DataFrame(index_new, columns=['SUBJECT_ID'])    # 2110

In [10]:
# Persist the filtered subject IDs as a NumPy array.
# NOTE(review): the under-sampling cells further down load 'index.npy', not this
# file - confirm which index is meant to feed them.
np.save('index_Yuhui.npy', index_new_df.to_numpy())

### under sampling

In [27]:
# Reload the subject index from disk (replaces the DataFrame previously bound to `index`).
index = np.load('index.npy')   

In [28]:
# One-column frame of subject IDs, used as the left side of the merges that follow.
index_df = pd.DataFrame(index, columns=['SUBJECT_ID'])

In [29]:
# Attach the mortality label to every indexed subject (left join keeps all of them).
temp = index_df.merge(label, how='left', on='SUBJECT_ID')
len(temp)

2078

In [33]:
# Class sizes before under-sampling.
print(int((temp['mortality'] == 1).sum()))   # died
print(int((temp['mortality'] == 0).sum()))   # survived

1068
1010


In [37]:
# Balance the two outcome classes by down-sampling the larger one to the size of
# the smaller one.  A fixed random_state makes the drawn subset - and every file
# derived from it - reproducible across kernel restarts.
UNDERSAMPLE_SEED = 42

survived = temp[temp['mortality']==0]
died = temp[temp['mortality']==1]

n_per_class = min(len(survived), len(died))
if len(died) > n_per_class:
    died = died.sample(n_per_class, random_state=UNDERSAMPLE_SEED)
elif len(survived) > n_per_class:
    # Sampling more rows than a class holds raises, so shrink this side
    # instead when the survivors are the majority.
    survived = survived.sample(n_per_class, random_state=UNDERSAMPLE_SEED)

In [38]:
# Stack both classes, order by subject, and write the balanced label table.
mortality_updated = (
    pd.concat([died, survived], ignore_index=True)
    .sort_values(by='SUBJECT_ID')
)
mortality_updated.to_csv('mortality label updated.csv', index=False, header=True)

In [49]:
# Subject IDs of the balanced cohort, saved for the preprocessing stage.
index_final = mortality_updated['SUBJECT_ID']
np.save('index_final.npy', index_final.to_numpy())

### preprocessing

In [2]:
# Balanced subject index written by the under-sampling step, plus a one-column
# frame of it for merging.
index = np.load('index_final.npy')
index_df = pd.DataFrame(index, columns=['SUBJECT_ID'])

# Procedure codes, without the ROW_ID column.
data = pd.read_csv('mimic_PROCEDURES_ICD.csv').drop(columns=['ROW_ID'])

In [4]:
# Admission timestamps, keyed by HADM_ID.
admit_time = pd.read_csv('ADMISSIONS.csv').loc[:, ['HADM_ID', 'ADMITTIME']]

# Subject -> procedure rows -> admission time; left joins keep every indexed subject.
temp = (
    index_df
    .merge(data, how='left', on='SUBJECT_ID')
    .merge(admit_time, how='left', on='HADM_ID')
)

In [5]:
###### no drop duplicate ICD9
# Order rows by subject, then by admission time, and renumber the index.
# NOTE(review): rows inside one admission are not ordered by an explicit key here;
# add SEQ_NUM to the sort keys if within-admission procedure order must be pinned - confirm.
df_dup = (
    temp
    .sort_values(by=['SUBJECT_ID', 'ADMITTIME'])
    .reset_index(drop=True)
)
df_dup.head(3)     # 16631

Unnamed: 0,SUBJECT_ID,HADM_ID,SEQ_NUM,ICD9_CODE,ADMITTIME
0,109,183350,1,5523,2137-11-04 19:36:00
1,109,183350,2,9971,2137-11-04 19:36:00
2,109,183350,3,3893,2137-11-04 19:36:00


In [50]:
###### drop duplicate ICD9 from each single admission
# A code is kept once per admission (first occurrence wins); the same code may
# still appear again under a different HADM_ID.
df = df_dup.drop_duplicates(subset=['HADM_ID', 'ICD9_CODE'])
df.tail()    # 15283

Unnamed: 0,SUBJECT_ID,HADM_ID,SEQ_NUM,ICD9_CODE,ADMITTIME
16626,99999,113369,1,8108,2117-12-30 07:15:00
16627,99999,113369,2,8051,2117-12-30 07:15:00
16628,99999,113369,3,8162,2117-12-30 07:15:00
16629,99999,113369,4,9979,2117-12-30 07:15:00
16630,99999,113369,5,8451,2117-12-30 07:15:00


## 2. Embedding

#### 1. ICD9 duplicates dropped

In [128]:
# Subject IDs as a plain Python list, in index order.
patients = index_df['SUBJECT_ID'].tolist()    # patients ID list

In [129]:
# List of lists: one "sentence" of procedure codes (as strings) per patient, in the
# same order as `patients`.  Built from `df`, i.e. with per-admission duplicates dropped.
codes_patients = [
    [str(code) for code in df.loc[df['SUBJECT_ID'] == patient, 'ICD9_CODE']]
    for patient in patients
]

In [45]:
from gensim.models import Word2Vec
import warnings
# Silences every warning for the rest of the session, not only gensim's.
warnings.filterwarnings('ignore')

In [130]:
# Skip-gram (sg=1) Word2Vec over the per-patient code sentences: 128-dimensional
# vectors, a context window of one code on each side, 20 training epochs.
# min_count=1 keeps codes that occur only once in the vocabulary.
model = Word2Vec(
    sentences=codes_patients,
    vector_size=128,
    window=1,
    min_count=1,
    sg=1,
    workers=4,
    epochs=20,
)

In [131]:
# Length of the longest patient sentence; every sentence is padded up to it.
max_num_procedure = max(len(codes) for codes in codes_patients)          # 85 words
max_num_procedure

## each patient is one sentence
## each procedure is one word
## padding all patients/sentences to have 85(i.e. max_num_procedure) procedures/words

# One shared padding vector, as wide as the embeddings (128).  It is drawn from a
# seeded generator so the saved embedding matrix is identical between runs.
PADDING_SEED = 0
padding = np.random.default_rng(PADDING_SEED).uniform(size=128).tolist()   # uniform random array for padding

In [132]:
embedding_patients = []   # embedding for 2020 patients/sentences

# codes_patients holds one code list per patient in the same order as `patients`,
# so iterate it directly rather than calling patients.index() for every patient,
# which rescans the whole list each time.
for patient_codes in codes_patients:
    # Look up the learned vector of every code in this patient's sentence.
    embedding_patient = [model.wv[code] for code in patient_codes]
    # Right-pad with the shared padding vector up to the common length
    # (a non-positive count makes the extend a no-op).
    n_missing = max_num_procedure - len(embedding_patient)
    embedding_patient.extend([padding] * n_missing)
    embedding_patients.append(embedding_patient)

In [133]:
# Stack the per-patient lists into a single (patients, procedures, dims) array.
embedding_patients_arr = np.asarray(embedding_patients)
embedding_patients_arr.shape

(2020, 85, 128)

In [134]:
# Exchange the last two axes: (patients, procedures, dims) -> (patients, dims, procedures).
embedding_patients_arr_swapped = np.swapaxes(embedding_patients_arr, 1, 2)   # swap axis
embedding_patients_arr_swapped.shape

(2020, 128, 85)

In [118]:
# Write the axis-swapped embedding array (duplicates-dropped variant) to disk.
np.save('embedding matrix.npy', embedding_patients_arr_swapped)   ################  embedding matrix

#### 2. ICD9 duplicates not dropped

In [135]:
# List of lists: one "sentence" of procedure codes (as strings) per patient, in the
# same order as `patients`.  Built from `df_dup`, so repeated codes are kept.
codes_patients = [
    [str(code) for code in df_dup.loc[df_dup['SUBJECT_ID'] == patient, 'ICD9_CODE']]
    for patient in patients
]

In [136]:
# Skip-gram (sg=1) Word2Vec over the per-patient code sentences: 128-dimensional
# vectors, a context window of one code on each side, 20 training epochs.
# min_count=1 keeps codes that occur only once in the vocabulary.
model = Word2Vec(
    sentences=codes_patients,
    vector_size=128,
    window=1,
    min_count=1,
    sg=1,
    workers=4,
    epochs=20,
)

In [137]:
# Length of the longest patient sentence; every sentence is padded up to it.
max_num_procedure = max(len(codes) for codes in codes_patients)          # 98 words
max_num_procedure

## each patient is one sentence
## each procedure is one word
## padding all patients/sentences to have 98(i.e. max_num_procedure) procedures/words

# One shared padding vector, as wide as the embeddings (128).  It is drawn from a
# seeded generator so the saved embedding matrix is identical between runs.
PADDING_SEED = 0
padding = np.random.default_rng(PADDING_SEED).uniform(size=128).tolist()   # uniform random array for padding

In [138]:
embedding_patients = []   # embedding for 2020 patients/sentences

# codes_patients holds one code list per patient in the same order as `patients`,
# so iterate it directly rather than calling patients.index() for every patient,
# which rescans the whole list each time.
for patient_codes in codes_patients:
    # Look up the learned vector of every code in this patient's sentence.
    embedding_patient = [model.wv[code] for code in patient_codes]
    # Right-pad with the shared padding vector up to the common length
    # (a non-positive count makes the extend a no-op).
    n_missing = max_num_procedure - len(embedding_patient)
    embedding_patient.extend([padding] * n_missing)
    embedding_patients.append(embedding_patient)

In [139]:
# Stack the per-patient lists into a single (patients, procedures, dims) array.
embedding_patients_arr = np.asarray(embedding_patients)
embedding_patients_arr.shape

(2020, 98, 128)

In [140]:
# Exchange the last two axes: (patients, procedures, dims) -> (patients, dims, procedures).
embedding_patients_arr_swapped = np.swapaxes(embedding_patients_arr, 1, 2)   # swap axis
embedding_patients_arr_swapped.shape

(2020, 128, 98)

In [141]:
# Write the axis-swapped embedding array (duplicates-kept variant) to disk.
np.save('embedding matrix_dup.npy', embedding_patients_arr_swapped)   ################  embedding matrix