In [1]:
import pandas as pd
import os
import numpy as np
import re
import tqdm
from multiprocessing import Pool
import pdb

In [2]:
# this code is taken from https://github.com/tanlab/ConvolutionMedicalNer
from nltk import sent_tokenize, word_tokenize
import re
import pandas as pd

SECTION_TITLES = re.compile(
    r'('
    r'ABDOMEN AND PELVIS|CLINICAL HISTORY|CLINICAL INDICATION|COMPARISON|COMPARISON STUDY DATE'
    r'|EXAM|EXAMINATION|FINDINGS|HISTORY|IMPRESSION|INDICATION'
    r'|MEDICAL CONDITION|PROCEDURE|REASON FOR EXAM|REASON FOR STUDY|REASON FOR THIS EXAMINATION'
    r'|TECHNIQUE'
    r'):|FINAL REPORT',
    re.I | re.M)
    
    
def getSentences(t):
    return list(preprocess_mimic(t))
    
def pattern_repl(matchobj):
    """
    Return a replacement string to be used for match object
    """
    return ' '.rjust(len(matchobj.group(0)))
    
def clean_text(text):
    """
    Clean text
    """

    # Replace [**Patterns**] with spaces.
    text = re.sub(r'\[\*\*.*?\*\*\]', pattern_repl, text)
    # Replace `_` with spaces.
    text = re.sub(r'_', ' ', text)

    start = 0
    end = find_end(text)
    new_text = ''
    if start > 0:
        new_text += ' ' * start
    new_text = text[start:end]

    # make sure the new text has the same length of old text.
    if len(text) - end > 0:
        new_text += ' ' * (len(text) - end)
    return new_text

def preprocess_mimic(text):
    """
    Preprocess reports in MIMIC-III.
    1. remove [**Patterns**] and signature
    2. split the report into sections
    3. tokenize sentences and words
    4. lowercase
    """
    for sec in split_heading(clean_text(text)):
        for sent in sent_tokenize(sec):
            text = ' '.join(word_tokenize(sent))
            yield text.lower()
            
def split_heading(text):
    """Split the report into sections"""
    start = 0
    for matcher in SECTION_TITLES.finditer(text):
        # add last
        end = matcher.start()
        if end != start:
            section = text[start:end].strip()
            if section:
                yield section

        # add title
        start = end
        end = matcher.end()
        if end != start:
            section = text[start:end].strip()
            if section:
                yield section

        start = end

    # add last piece
    end = len(text)
    if start < end:
        section = text[start:end].strip()
        if section:
            yield section
            
def find_end(text):
    """Find the end of the report."""
    ends = [len(text)]
    patterns = [
        re.compile(r'BY ELECTRONICALLY SIGNING THIS REPORT', re.I),
        re.compile(r'\n {3,}DR.', re.I),
        re.compile(r'[ ]{1,}RADLINE ', re.I),
        re.compile(r'.*electronically signed on', re.I),
        re.compile(r'M\[0KM\[0KM')
    ]
    for pattern in patterns:
        matchobj = pattern.search(text)
        if matchobj:
            ends.append(matchobj.start())
    return min(ends)

In [3]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/sanghoon/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [4]:
data_folder='../data/'
mimic_folder='/ssd1/mimic3'
save_path=data_folder

In [5]:
icustays=pd.read_csv(mimic_folder+'/ICUSTAYS.csv')
sub_notes_have_ts=pd.read_csv(data_folder+'/sub_note.csv') 
prev_data_t =pd.read_csv(data_folder+'/preprocessed_emb_idx.csv')
TS_data = pd.read_hdf(data_folder + '/vitals_24hourly_data_sh.h5','X')
label_merge= pd.read_csv(data_folder + '/label_have_ts_los.csv') 


  sub_notes_have_ts=pd.read_csv(data_folder+'/sub_note.csv')


In [6]:
del label_merge['Unnamed: 0']

# prerocess

In [7]:


def make_above_hours_data(data, window_size, gap_time, icustay_data):
    icustay_data=icustay_data.rename(columns={'ICUSTAY_ID':'icustay_id','INTIME':'intime'})
    data = data[data['max_hours']>window_size+gap_time]
    #data.drop(columns=['los'], inplace=True)
    data=data.astype(float)
    data=data.merge(icustay_data[['icustay_id','intime']])
    data= data.sort_values('intime',ascending=False)
    last_visit_label = data[data['subject_id'].duplicated()==False]
    print('all patient : {}'.format(len(last_visit_label['subject_id'].unique())))
    return last_visit_label

def make_note_data(note_have_ts,last_visit_label_data, window_size):
    note_have_ts=note_have_ts.rename(columns={'SUBJECT_ID':'subject_id','HADM_ID':'hadm_id'})
    df_adm_notes = pd.merge(note_have_ts[['ROW_ID','subject_id','hadm_id','CHARTTIME', 'CATEGORY','DESCRIPTION', 'TEXT']],
                            last_visit_label_data[['subject_id','hadm_id','icustay_id','intime',  'mort_icu', 'mort_hosp'
                                       ]], 
                            on = ['subject_id'],
                            how = 'left')
    df_adm_notes['CHARTTIME'] = pd.to_datetime(df_adm_notes['CHARTTIME'])
    df_adm_notes['intime'] = pd.to_datetime(df_adm_notes['intime'])

    df_less_n = df_adm_notes[((df_adm_notes['CHARTTIME']-df_adm_notes['intime']).dt.total_seconds()/(60*60))<window_size]
    df_less_n = df_less_n[((df_less_n['CHARTTIME'] - df_less_n['intime']).dt.total_seconds()/(24*60*60))>0]
    
    sub_notes = df_less_n[df_less_n.subject_id.notnull()]
    sub_notes = sub_notes[sub_notes.CHARTTIME.notnull()]
    sub_notes = sub_notes[sub_notes.TEXT.notnull()]
    sub_notes = sub_notes[['subject_id', 'hadm_id_y','CATEGORY','DESCRIPTION','CHARTTIME', 'TEXT']]
    
    sub_notes['preprocessed_text'] = None
    
    all_text = sub_notes['TEXT'].values
    with Pool(48) as pool:
        preprocessed_all_text = list(pool.imap(getSentences, all_text))
        
    sub_notes['preprocessed_text']=preprocessed_all_text
    return sub_notes
    
def filter_prev_data(prev_data, last_visit_label_data):
    prev_data=prev_data.rename(columns={'SUBJECT_ID':'subject_id'})
    prev_data=prev_data[prev_data['subject_id'].isin(last_visit_label_data['subject_id'].unique())]
    prev_data=prev_data.merge(last_visit_label_data[['subject_id','hadm_id','intime']],on='subject_id')
    prev_data=prev_data[prev_data['HADM_ID']!= prev_data['hadm_id']]
    prev_data['DISCHTIME'] = pd.to_datetime(prev_data['DISCHTIME'])
    prev_data['intime'] = pd.to_datetime(prev_data['intime'])
    prev_data=prev_data[prev_data['DISCHTIME']< prev_data['intime']]
    
    prev_data = prev_data.sort_values(['subject_id','DISCHTIME'], ascending=False)
    prev_data['count'] = prev_data.groupby('subject_id').cumcount()+1
    prev_data= prev_data[prev_data['count']<=10]
    
    print('multi visit patient : {} '.format(len(prev_data['subject_id'].unique())))
    return prev_data

def filter_monitoring_data(monitoring_data, last_visit_label_data):
    monitoring_data=monitoring_data.reset_index()
    monitoring_data=monitoring_data[monitoring_data['icustay_id'].isin(last_visit_label_data.icustay_id.unique())]
    monitoring_data=monitoring_data.groupby(['subject_id','hadm_id','icustay_id','hours_in']).first()
    
    return monitoring_data
def check_sub_have_modal(last_visit_label_data, modal , check_modal_data):
    # modal = multi_visit or note
    check_id=check_modal_data.subject_id.unique()
    last_visit_label_data[modal] =list(map(lambda x : 1 if x in check_id else 0, last_visit_label_data['subject_id']))
    return last_visit_label_data


# check 24562, 17 

In [8]:
WINDOW=24
GAP=6

In [9]:
last_visit_label_24_6 = make_above_hours_data(label_merge, window_size=WINDOW, gap_time=GAP,icustay_data=icustays)
prev_data_24_6 = filter_prev_data(prev_data_t,last_visit_label_24_6)

all patient : 32504
multi visit patient : 5751 


In [10]:
note_24_6 = make_note_data(sub_notes_have_ts,last_visit_label_24_6, WINDOW)
last_visit_label_24_6=check_sub_have_modal(last_visit_label_24_6, 'multi_visit',prev_data_24_6)
last_visit_label_24_6=check_sub_have_modal(last_visit_label_24_6, 'have_note',note_24_6)



In [11]:
ts_24_6 = filter_monitoring_data(TS_data,last_visit_label_24_6)

  monitoring_data=monitoring_data.groupby(['subject_id','hadm_id','icustay_id','hours_in']).first()


In [12]:
last_visit_label_24_6['los_3'] =(last_visit_label_24_6['los']>3).astype(float)
last_visit_label_24_6['los_7'] =(last_visit_label_24_6['los']>7).astype(float)

In [13]:
(last_visit_label_24_6['los']>3).sum() , (last_visit_label_24_6['los']>=7).sum()

(17372, 7761)

In [14]:
last_visit_label_24_6.to_csv(save_path+'label_last_visit.csv',index=False)
prev_data_24_6.to_csv(save_path+'prev_idx_data.csv', index=False)
note_24_6.to_pickle(save_path+'preprocessed_notes.pkl')
ts_24_6.to_hdf(save_path+'ts_24_6.h5','X')

  check_attribute_name(name)
  check_attribute_name(name)


In [15]:
last_visit_label_24_6

Unnamed: 0,subject_id,hadm_id,icustay_id,max_hours,mort_icu,mort_hosp,los,intime,multi_visit,have_note,los_3,los_7
18264,24562.0,166275.0,203462.0,54.0,0.0,0.0,2.2506,2210-08-18 12:34:24,1,1,0.0,0.0
23880,25723.0,127135.0,234115.0,170.0,0.0,0.0,7.1249,2209-07-31 13:52:39,1,0,1.0,1.0
2200,2846.0,195990.0,252411.0,40.0,0.0,0.0,1.6696,2209-02-09 22:48:52,1,1,0.0,0.0
39744,98185.0,116667.0,216102.0,40.0,0.0,0.0,1.6906,2208-08-19 13:03:37,1,1,0.0,0.0
5864,7632.0,183768.0,280210.0,131.0,0.0,0.0,5.4697,2208-05-27 02:33:27,1,1,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
37149,79168.0,125272.0,293960.0,214.0,0.0,0.0,8.9252,2100-07-06 15:02:22,0,1,1.0,1.0
1377,1291.0,141087.0,299677.0,462.0,0.0,0.0,19.2676,2100-07-04 10:29:48,0,1,1.0,1.0
27566,32096.0,158366.0,240498.0,55.0,0.0,0.0,2.2924,2100-06-22 06:34:52,0,1,0.0,0.0
8781,12001.0,173927.0,222148.0,110.0,0.0,0.0,4.6201,2100-06-14 04:56:39,0,1,1.0,0.0


## train_test_split

In [16]:
SEED= 0
split_path =  save_path +'/split_'+ str(SEED) +'/'
if os.path.isdir(split_path) == False:
            os.mkdir(split_path)


In [17]:
last_visit_label_24_6=last_visit_label_24_6.set_index('hadm_id')

In [18]:
train_frac, dev_frac, test_frac = 0.7, 0.1, 0.2
hadm_ids =set(last_visit_label_24_6.index)

np.random.seed(SEED)
subjects, N = np.random.permutation(list(hadm_ids)), len(hadm_ids)
N_train, N_dev, N_test = int(train_frac * N), int(dev_frac * N), int(test_frac * N)


In [19]:
train_subj = subjects[:N_train]
dev_subj   = subjects[N_train:N_train + N_dev]
test_subj  = subjects[N_train+N_dev:]

In [20]:
        
pd.to_pickle(train_subj,split_path+'train_hadm_idx.pkl' )
pd.to_pickle(dev_subj,split_path+'dev_hadm_idx.pkl' )
pd.to_pickle(test_subj,split_path+'test_hadm_idx.pkl' )

In [21]:
len(train_subj), len(dev_subj) , len(test_subj)

(22752, 3250, 6502)

In [22]:
last_visit_label_24_6.loc[train_subj]

Unnamed: 0_level_0,subject_id,icustay_id,max_hours,mort_icu,mort_hosp,los,intime,multi_visit,have_note,los_3,los_7
hadm_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
112538.0,40866.0,290354.0,333.0,0.0,0.0,13.8916,2104-07-01 18:06:22,0,1,1.0,1.0
180033.0,83988.0,240937.0,52.0,0.0,1.0,2.1800,2143-06-19 12:17:01,0,1,0.0,0.0
194001.0,10523.0,207531.0,57.0,0.0,0.0,2.3893,2155-03-29 08:23:03,0,1,0.0,0.0
181309.0,8934.0,295861.0,35.0,0.0,1.0,1.4743,2108-12-23 11:45:29,0,1,0.0,0.0
198791.0,6241.0,213875.0,327.0,0.0,0.0,13.6259,2177-09-16 22:45:08,0,1,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
167424.0,9666.0,248058.0,82.0,0.0,0.0,3.4439,2105-01-21 12:21:29,0,1,1.0,0.0
191586.0,24409.0,218258.0,39.0,0.0,0.0,1.6372,2157-02-02 21:25:08,0,1,0.0,0.0
130747.0,63074.0,289210.0,117.0,0.0,0.0,4.9085,2189-05-29 20:22:26,0,1,1.0,0.0
182269.0,15246.0,200933.0,119.0,0.0,0.0,4.9805,2157-11-13 19:03:50,0,1,1.0,0.0


# preprocess for ts

In [23]:
ts_feature=ts_24_6

In [24]:
ID_COLS           = ['subject_id', 'hadm_id', 'icustay_id']

def simple_imputer(df):
    idx = pd.IndexSlice
    df = df.copy()
    if len(df.columns.names) > 2: df.columns = df.columns.droplevel(('label', 'LEVEL1', 'LEVEL2'))
    
    df_out = df.loc[:, idx[:, ['mean', 'count']]]
    icustay_means = df_out.loc[:, idx[:, 'mean']].groupby(ID_COLS).mean()
    
    df_out.loc[:,idx[:,'mean']] = df_out.loc[:,idx[:,'mean']].groupby(ID_COLS).fillna(
        method='ffill'
    ).groupby(ID_COLS).fillna(icustay_means).fillna(0)
    
    df_out.loc[:, idx[:, 'count']] = (df.loc[:, idx[:, 'count']] > 0).astype(float)
    df_out.rename(columns={'count': 'mask'}, level='Aggregation Function', inplace=True)
    
    is_absent = (1 - df_out.loc[:, idx[:, 'mask']])
    hours_of_absence = is_absent.cumsum()
    time_since_measured = hours_of_absence - hours_of_absence[is_absent==0].fillna(method='ffill')
    time_since_measured.rename(columns={'mask': 'time_since_measured'}, level='Aggregation Function', inplace=True)

    df_out = pd.concat((df_out, time_since_measured), axis=1)
    df_out.loc[:, idx[:, 'time_since_measured']] = df_out.loc[:, idx[:, 'time_since_measured']].fillna(100)
    
    df_out.sort_index(axis=1, inplace=True)
    return df_out

In [25]:
all_sub_index=list(set(ts_feature.index.get_level_values('subject_id')))

In [26]:
time_list = [i for i in range(WINDOW)]
for sub_id in tqdm.tqdm(all_sub_index):
    sub_df=ts_feature.loc[sub_id]
    hadm_id, icu_id, _ = sub_df.index[0]
    tmp_hours=sub_df.index.get_level_values('hours_in')
    for time in time_list:
        if time not in tmp_hours:
            try:
                ts_feature.loc[sub_id,hadm_id,icu_id,time] = ts_feature.loc[sub_id,hadm_id,icu_id,time-1]
            except:
                nearest_index = tmp_hours[np.argmin(np.abs(tmp_hours - time))]
                ts_feature.loc[sub_id,hadm_id,icu_id,time] = ts_feature.loc[sub_id,hadm_id,icu_id,nearest_index]
    ts_feature.index = ts_feature.index.set_names(['subject_id','hadm_id','icustay_id','hours_in'])

100%|██████████████████████████████████████████████████████████████████| 32504/32504 [3:16:12<00:00,  2.76it/s]


In [27]:
ts_feature=ts_feature.sort_index()
ts_feature=ts_feature.loc[:,:,:,:WINDOW-1]
ts_feature.values.reshape(-1,WINDOW,312).shape

(32504, 24, 312)

In [28]:
ts_feature_filed=simple_imputer(ts_feature)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_out.rename(columns={'count': 'mask'}, level='Aggregation Function', inplace=True)


In [29]:
pd.to_pickle(ts_feature,save_path+'ts_feature.pkl')
pd.to_pickle(ts_feature_filed,save_path+'ts_feature_simple_impute.pkl')


In [30]:
train_ts = ts_feature_filed[ts_feature_filed.index.get_level_values('hadm_id').isin(train_subj)]
dev_ts = ts_feature_filed[ts_feature_filed.index.get_level_values('hadm_id').isin(dev_subj)]
test_ts = ts_feature_filed[ts_feature_filed.index.get_level_values('hadm_id').isin(test_subj)]

In [31]:
idx = pd.IndexSlice
lvl2_means, lvl2_stds = train_ts.loc[:, idx[:,'mean']].mean(axis=0), train_ts.loc[:, idx[:,'mean']].std(axis=0)

train_ts.loc[:, idx[:,'mean']] = (train_ts.loc[:, idx[:,'mean']] - lvl2_means)/lvl2_stds
dev_ts.loc[:, idx[:,'mean']] = (dev_ts.loc[:, idx[:,'mean']] - lvl2_means)/lvl2_stds
test_ts.loc[:, idx[:,'mean']] = (test_ts.loc[:, idx[:,'mean']] - lvl2_means)/lvl2_stds

In [32]:
train_ts=train_ts.loc[:, pd.IndexSlice[:, 'mean']]
dev_ts=dev_ts.loc[:, pd.IndexSlice[:, 'mean']]
test_ts=test_ts.loc[:, pd.IndexSlice[:, 'mean']]

train_ts.to_pickle(split_path+'train_ts_simple_impute.pkl')
test_ts.to_pickle(split_path+'test_ts_simple_impute.pkl')
dev_ts.to_pickle(split_path+'dev_ts_simple_impute.pkl')