In [1]:
import os

import pandas as pd

In [2]:
def split_df(dataframe):
    """Tag rows 60/20/20 as train / validation / test according to row position.

    With ``n = len(dataframe)``, the first ``int(n * 0.6)`` rows are labelled
    ``'train'``, the following ``int(n * 0.2)`` rows ``'validation'`` and all
    remaining rows ``'test'``. Rows are used in their current order; nothing
    is shuffled.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to annotate. It is modified in place: a ``'split'`` column is
        added (or overwritten if it already exists).

    Returns
    -------
    pandas.DataFrame
        The same ``dataframe`` object, now carrying the ``'split'`` column.
    """
    total_rows = len(dataframe)
    train_end = int(total_rows * 0.6)
    validation_end = train_end + int(total_rows * 0.2)

    dataframe['split'] = 'train'  # default; the two slices below overwrite the tail
    # Slice by position (iloc) rather than by label (loc): label slicing is only
    # correct when the index is a fresh 0..n-1 range, and silently labels the
    # wrong rows (or none) for any other index.
    split_col = dataframe.columns.get_loc('split')
    dataframe.iloc[train_end:validation_end, split_col] = 'validation'
    dataframe.iloc[validation_end:, split_col] = 'test'
    return dataframe

# Load

In [3]:
# Folder containing the CSV tables that are loaded into the `h_*` frames
# (the `_hf` cohort -- presumably heart failure; confirm against the upstream notebook).
h_dir = './processed_hf/'

In [4]:
# Read the five tables stored under `h_dir`; the first CSV column is the saved index.
h_cohort, h_diag, h_labs, h_proc, h_meds = (
    pd.read_csv(f'{h_dir}{table}.csv', index_col=0)
    for table in ('cohort', 'diag', 'labs', 'proc', 'meds')
)

  mask |= (ar1 == a)


In [5]:
# Folder containing the CSV tables that are loaded into the `d_*` frames (the `_diabetes` cohort).
d_dir = './processed_diabetes/'

In [6]:
# Read the five tables stored under `d_dir`; the first CSV column is the saved index.
d_cohort, d_diag, d_labs, d_proc, d_meds = (
    pd.read_csv(f'{d_dir}{table}.csv', index_col=0)
    for table in ('cohort', 'diag', 'labs', 'proc', 'meds')
)

# Process

In [7]:
# Destination folder for the merged cohort / diag / labs / proc / meds CSVs written below.
save_dir = './processed/'
# DataFrame.to_csv does not create missing parent folders, so make sure the
# target exists before any table is written (no-op when it is already there).
os.makedirs(save_dir, exist_ok=True)

In [13]:
# Outer-join the two cohorts on the shared admission columns. Overlapping
# non-key columns get the `_hf` / `_diabetes` suffix, and every value missing
# after the join (e.g. the label of a cohort an admission is not part of)
# is filled with 0.
merge_keys = ['subject_id', 'hadm_id', 'admittime', 'dischtime',
              'gender', 'ethnicity', 'insurance']
cohort = pd.merge(h_cohort, d_cohort, how='outer', on=merge_keys,
                  suffixes=('_hf', '_diabetes')).fillna(0)

# Helper flag used only for stratification; removed again before saving.
cohort['insurance_01'] = cohort['insurance'] == 'Other'

# Every stratum is split 60/20/20 on its own so that each combination of
# gender x label_hf x label_diabetes x insurance flag x ethnicity is represented
# in train, validation and test.
# NOTE: rows whose values fall outside the combinations enumerated here
# (e.g. any other ethnicity string) are not part of any stratum and are
# therefore absent from the resulting `cohort`.
strata = [
    (gender, label_hf, label_diabetes, insurance_01, ethnicity)
    for gender in ['F', 'M']
    for label_hf in [0, 1]
    for label_diabetes in [0, 1]
    for insurance_01 in [True, False]
    for ethnicity in ['WHITE', 'BLACK/AFRICAN AMERICAN']
]

split_dfs = []
for gender, label_hf, label_diabetes, insurance_01, ethnicity in strata:
    subset = cohort[(cohort['gender'] == gender) &
                    (cohort['label_hf'] == label_hf) &
                    (cohort['label_diabetes'] == label_diabetes) &
                    (cohort['ethnicity'] == ethnicity) &
                    (cohort['insurance_01'] == insurance_01)]
    if not subset.empty:
        # drop=True discards the old row labels instead of materialising them
        # as an extra 'index' column; split_df then sees a clean 0..n-1 index
        # and annotates the copy, not `cohort` itself.
        split_dfs.append(split_df(subset.copy().reset_index(drop=True)))

cohort = (
    pd.concat(split_dfs)
    .sort_values(['subject_id', 'admittime'])
    .reset_index(drop=True)
    # Drop the stratification helper and the per-cohort split columns that
    # came in through the merge; only the freshly computed 'split' remains.
    .drop(columns=['insurance_01', 'split_hf', 'split_diabetes'])
)
print(cohort['split'].value_counts())
cohort.sample(5)

train         65276
test          21782
validation    21749
Name: split, dtype: int64


Unnamed: 0,subject_id,hadm_id,admittime,dischtime,gender,ethnicity,insurance,label_hf,label_diabetes,split
87102,18011109.0,27564859.0,2132-09-24 05:26:00,2132-10-01 12:51:00,M,WHITE,Other,0.0,0.0,test
46840,14298628.0,26821756.0,2164-03-31 15:17:00,2164-04-02 12:45:00,M,WHITE,Medicare,1.0,0.0,train
89182,18193043.0,27045774.0,2153-02-06 17:31:00,2153-02-12 14:31:00,M,WHITE,Medicare,0.0,0.0,train
63721,15870527.0,23326084.0,2163-05-26 22:20:00,2163-05-28 18:09:00,F,BLACK/AFRICAN AMERICAN,Medicare,0.0,0.0,train
31154,12852721.0,22742155.0,2121-06-05 12:28:00,2121-06-10 16:20:00,F,WHITE,Medicare,0.0,0.0,train


In [15]:
# Write the merged, split-annotated cohort to disk, then print its schema as a quick check.
cohort.to_csv(f'{save_dir}cohort.csv')
cohort.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 108807 entries, 0 to 108806
Data columns (total 10 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   subject_id      108807 non-null  float64
 1   hadm_id         108807 non-null  float64
 2   admittime       108807 non-null  object 
 3   dischtime       108807 non-null  object 
 4   gender          108807 non-null  object 
 5   ethnicity       108807 non-null  object 
 6   insurance       108807 non-null  object 
 7   label_hf        108807 non-null  float64
 8   label_diabetes  108807 non-null  float64
 9   split           108807 non-null  object 
dtypes: float64(4), object(6)
memory usage: 8.3+ MB


In [22]:
# Pool the diagnosis rows of both cohorts, remove exact duplicates, order them by
# patient and admission, and attach both labels from `cohort`. The merge is an
# inner join on hadm_id, so rows for admissions missing from `cohort` are dropped.
diag = (
    pd.concat([h_diag, d_diag])
    .drop_duplicates()
    .sort_values(['subject_id', 'hadm_id'])
    .reset_index(drop=True)
    .merge(cohort[['hadm_id', 'label_hf', 'label_diabetes']], on='hadm_id')
)
diag.to_csv(f'{save_dir}diag.csv')
diag.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1468261 entries, 0 to 1468260
Data columns (total 6 columns):
 #   Column          Non-Null Count    Dtype  
---  ------          --------------    -----  
 0   subject_id      1468261 non-null  int64  
 1   hadm_id         1468261 non-null  int64  
 2   new_icd_code    1468261 non-null  object 
 3   long_title      1468261 non-null  object 
 4   label_hf        1468261 non-null  float64
 5   label_diabetes  1468261 non-null  float64
dtypes: float64(2), int64(2), object(2)
memory usage: 78.4+ MB


In [24]:
# Pool the lab rows of both cohorts, remove exact duplicates, order them by
# patient and admission, and attach both labels from `cohort`. The merge is an
# inner join on hadm_id, so rows for admissions missing from `cohort` are dropped.
labs = (
    pd.concat([h_labs, d_labs])
    .drop_duplicates()
    .sort_values(['subject_id', 'hadm_id'])
    .reset_index(drop=True)
    .merge(cohort[['hadm_id', 'label_hf', 'label_diabetes']], on='hadm_id')
)
labs.to_csv(f'{save_dir}labs.csv')
labs.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21071602 entries, 0 to 21071601
Data columns (total 10 columns):
 #   Column               Dtype  
---  ------               -----  
 0   subject_id           int64  
 1   hadm_id              float64
 2   itemid               int64  
 3   lab_time_from_admit  object 
 4   valuenum             float64
 5   label                object 
 6   fluid                object 
 7   category             object 
 8   label_hf             float64
 9   label_diabetes       float64
dtypes: float64(4), int64(2), object(4)
memory usage: 1.7+ GB


In [25]:
# Pool the procedure rows of both cohorts, remove exact duplicates, order them by
# patient and admission, and attach both labels from `cohort`. The merge is an
# inner join on hadm_id, so rows for admissions missing from `cohort` are dropped.
proc = (
    pd.concat([h_proc, d_proc])
    .drop_duplicates()
    .sort_values(['subject_id', 'hadm_id'])
    .reset_index(drop=True)
    .merge(cohort[['hadm_id', 'label_hf', 'label_diabetes']], on='hadm_id')
)
proc.to_csv(f'{save_dir}proc.csv')
proc.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 60805 entries, 0 to 60804
Data columns (total 7 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   subject_id            60805 non-null  int64  
 1   hadm_id               60805 non-null  int64  
 2   icd_code              60805 non-null  object 
 3   proc_time_from_admit  60805 non-null  object 
 4   long_title            60805 non-null  object 
 5   label_hf              60805 non-null  float64
 6   label_diabetes        60805 non-null  float64
dtypes: float64(2), int64(2), object(3)
memory usage: 3.7+ MB


In [26]:
# Pool the medication rows of both cohorts, remove exact duplicates, order them by
# patient and admission, and attach both labels from `cohort`. The merge is an
# inner join on hadm_id, so rows for admissions missing from `cohort` are dropped.
meds = (
    pd.concat([h_meds, d_meds])
    .drop_duplicates()
    .sort_values(['subject_id', 'hadm_id'])
    .reset_index(drop=True)
    .merge(cohort[['hadm_id', 'label_hf', 'label_diabetes']], on='hadm_id')
)
meds.to_csv(f'{save_dir}meds.csv')
meds.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2835528 entries, 0 to 2835527
Data columns (total 8 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   subject_id              int64  
 1   hadm_id                 int64  
 2   drug_name               object 
 3   start_hours_from_admit  object 
 4   stop_hours_from_admit   object 
 5   dose_val_rx             object 
 6   label_hf                float64
 7   label_diabetes          float64
dtypes: float64(2), int64(2), object(4)
memory usage: 194.7+ MB


In [27]:
# Bundle the `processed` folder into a gzip-compressed archive
# (-z gzip, -c create, -v list files as they are added, -f output file name).
!tar -zcvf processed.tar.gz processed

processed/
processed/diag.csv
processed/proc.csv
processed/meds.csv
processed/labs.csv
processed/cohort.csv
