A lot of the below code is using work by https://github.com/jamesmullenbach/caml-mimic

In [3]:
import pandas as pd
import csv
from tqdm import tqdm
from nltk.tokenize import RegexpTokenizer


In [4]:
MIMIC_4_DIR = "C:/Users/Buse/Desktop/Zurich/Thesis/Codebase/thesis_playground/Data/MIMIC-IV"

## Helper functions

In [3]:
def reformat(row, is_diag):
    """
        Put a period in the right place because the MIMIC-3 data files exclude them.
        Generally, procedure codes have dots after the first two digits, 
        while diagnosis codes have dots after the first three digits.
    """
    code = str(row["icd_code"])
    try:
        version = int(row["icd_version"])
    except:
        print(row, row["icd_version"])
        return code
    if version == 9:
        code = ''.join(code.split('.'))
        if is_diag:
            if code.startswith('E'):
                if len(code) > 4:
                    code = code[:4] + '.' + code[4:]
            else:
                if len(code) > 3:
                    code = code[:3] + '.' + code[3:]
        else:
            code = code[:2] + '.' + code[2:]
    return code

In [5]:
tokenizer = RegexpTokenizer(r'\w+')

In [6]:
def write_discharge_summaries(out_file):
    notes_file = '%s/discharge.csv' % (MIMIC_4_DIR)
    print("processing notes file")
    with open(notes_file, 'r', encoding="utf8") as csvfile:
        with open(out_file, 'w', encoding="utf-8") as outfile:
            print("writing to %s" % (out_file))
            outfile.write(','.join(['SUBJECT_ID', 'HADM_ID', 'CHARTTIME', 'TEXT']) + '\n')
            notereader = csv.reader(csvfile)
            #header
            next(notereader)
            i = 0
            for line in tqdm(notereader):
                subj = int(line[1])
                note = line[7]
                #tokenize, lowercase and remove numerics
                tokens = [t.lower() for t in tokenizer.tokenize(note) if not t.isnumeric()]
                text = '"' + ' '.join(tokens) + '"'
                outfile.write(','.join([line[1], line[2], line[5], text]) + '\n')
                i += 1
    return out_file


# Data processing

In [7]:
Y = 'full' #use all available labels in the dataset for prediction
notes_file = '%s/discharge.csv' % MIMIC_4_DIR # raw note events downloaded from MIMIC-IV
vocab_size = 'full' #don't limit the vocab size to a specific number
vocab_min = 3 #discard tokens appearing in fewer than this many documents

## Combine diagnosis and procedure codes and reformat them

The codes in MIMIC-III are given in separate files for procedures and diagnoses, and the codes are given without periods, which might lead to collisions if we naively combine them. So we have to add the periods back in the right place.

In [8]:
dfproc = pd.read_csv('%s/procedures_icd.csv' % MIMIC_4_DIR)
dfdiag = pd.read_csv('%s/diagnoses_icd.csv' % MIMIC_4_DIR)

In [9]:
dfdiag['absolute_code'] = dfdiag.apply(lambda row: str(reformat(row, True)), axis=1)
dfproc['absolute_code'] = dfproc.apply(lambda row: str(reformat(row, False)), axis=1)

In [10]:
dfcodes = pd.concat([dfdiag, dfproc])

In [24]:
dfcodes.to_csv('%s/all_codes.csv' % MIMIC_4_DIR, index=False,
               columns=['subject_id', 'hadm_id', 'seq_num', 'absolute_code', 'icd_version'],
               header=['SUBJECT_ID', 'HADM_ID', 'SEQ_NUM', 'ICD_CODE', 'ICD_VERSION'])

## How many codes are there?

In [12]:
#In the full dataset (not just discharge summaries)
df = pd.read_csv('%s/all_codes.csv' % MIMIC_4_DIR, dtype={"ICD_CODE": str})
len(df['ICD_CODE'].unique())

38399

In [13]:
len(df[df.ICD_VERSION == 9].ICD_CODE.unique())

11616

In [14]:
len(df[df.ICD_VERSION == 10].ICD_CODE.unique())

26788

Exploration suggests there are same labeled codes in 9 and 10

In [15]:
icd9_set_temp  = set(df[df.ICD_VERSION == 9].ICD_CODE.unique())
icd10_set_temp = set(df[df.ICD_VERSION == 10].ICD_CODE.unique())
icd9_set_temp.intersection(icd10_set_temp)

{'E848', 'E851', 'E882', 'E895', 'E896'}

They mean completely different things in ICD-9 and ICD-10 so we can ignore.

## Tokenize and preprocess raw text

Preprocessing time!

This will:
- Select only discharge summaries and their addenda
- remove punctuation and numeric-only tokens, removing 500 but keeping 250mg
- lowercase all tokens

In [16]:
#This reads all notes, selects only the discharge summaries, and tokenizes them, returning the output filename
disch_full_file = write_discharge_summaries(out_file="%s/disch_full.csv" % MIMIC_4_DIR)

processing notes file
writing to C:/Users/Buse/Desktop/Zurich/Thesis/Codebase/thesis_playground/Data/MIMIC-IV/disch_full.csv
331794it [07:37, 724.97it/s]


Let's read this in and see what kind of data we're working with

In [5]:
df = pd.read_csv('%s/disch_full.csv' % MIMIC_4_DIR)

In [6]:
#How many admissions?
len(df['HADM_ID'].unique())

331794

In [7]:
#Let's sort by SUBJECT_ID and HADM_ID to make a correspondence with the MIMIC-4 label file
df = df.sort_values(['SUBJECT_ID', 'HADM_ID'])

In [8]:
#Sort the label file by the same
dfl = pd.read_csv('%s/all_codes.csv' % MIMIC_4_DIR)
dfl = dfl.sort_values(['SUBJECT_ID', 'HADM_ID'])

In [9]:
len(df['HADM_ID'].unique()), len(dfl['HADM_ID'].unique())

(331794, 430876)

## Consolidate labels with set of discharge summaries

Looks like there were some HADM_ID's that didn't have discharge summaries, so they weren't included with our notes

In [23]:
#Let's filter out these HADM_ID's
hadm_ids = set(df['HADM_ID'])
with open('%s/all_codes.csv' % MIMIC_4_DIR, 'r') as lf:
    with open('%s/all_codes_filtered.csv' % MIMIC_4_DIR, 'w') as of:
        w = csv.writer(of)
        w.writerow(['SUBJECT_ID', 'HADM_ID', 'ICD_CODE', 'ICD_VERSION', 'ADMITTIME', 'DISCHTIME'])
        r = csv.reader(lf)
        #header
        next(r)
        for i,row in enumerate(r):
            hadm_id = int(row[1])
            #print(hadm_id)
            #break
            if hadm_id in hadm_ids:
                w.writerow(row[0:2] + row[3:5] + ['', ''])

In [24]:
dfl = pd.read_csv('%s/all_codes_filtered.csv' % MIMIC_4_DIR, index_col=None)

In [25]:
len(dfl['HADM_ID'].unique())

331669

Seems like there are some discharge summaries that don't have ICD codes, we already filtered them out as well.

In [26]:
len(hadm_ids.difference(dfl['HADM_ID'].unique()))

125

In [27]:
#we still need to sort it by HADM_ID
dfl = dfl.sort_values(['SUBJECT_ID', 'HADM_ID'])
dfl.to_csv('%s/all_codes_filtered.csv' % MIMIC_4_DIR, index=False)

## Create two seperate tables for ICD-9 and ICD-10 codes

In [30]:
dfl[dfl.ICD_VERSION == 9].to_csv('%s/all_codes_filtered_ICD9.csv' % MIMIC_4_DIR, index=False,
               columns=['SUBJECT_ID', 'HADM_ID', 'ICD_CODE', 'ADMITTIME', 'DISCHTIME'],
               header=['SUBJECT_ID', 'HADM_ID', 'ICD_CODE', 'ADMITTIME', 'DISCHTIME'])

In [32]:
len(dfl[dfl.ICD_VERSION == 9].ICD_CODE.unique())

11331

In [38]:
len(dfl[dfl.ICD_VERSION == 9].HADM_ID.unique())

209359

In [31]:
dfl[dfl.ICD_VERSION == 10].to_csv('%s/all_codes_filtered_ICD10.csv' % MIMIC_4_DIR, index=False,
               columns=['SUBJECT_ID', 'HADM_ID', 'ICD_CODE', 'ADMITTIME', 'DISCHTIME'],
               header=['SUBJECT_ID', 'HADM_ID', 'ICD_CODE', 'ADMITTIME', 'DISCHTIME'])

In [33]:
len(dfl[dfl.ICD_VERSION == 10].ICD_CODE.unique())

26098

In [39]:
len(dfl[dfl.ICD_VERSION == 10].HADM_ID.unique())

122317

## Append labels to notes in a single file

In [34]:
#Now let's append each instance with all of its codes
#this is pretty non-trivial so let's use this script I wrote, which requires the notes to be written to file
sorted_file = '%s/disch_full.csv' % MIMIC_4_DIR
df.to_csv(sorted_file, index=False)

In [35]:
from dataproc import concat_and_split
labeled = concat_and_split.concat_data_respiratory('%s/all_codes_filtered_ICD9.csv' % MIMIC_4_DIR, sorted_file, '%s/notes_labeled_ICD9.csv' % MIMIC_4_DIR)

CONCATENATING
0 done
10000 done
20000 done
30000 done
40000 done
50000 done
60000 done
70000 done
80000 done
90000 done
100000 done
110000 done
120000 done
130000 done
140000 done
150000 done
160000 done
170000 done
180000 done
190000 done
200000 done


In [36]:
labeled = concat_and_split.concat_data_respiratory('%s/all_codes_filtered_ICD10.csv' % MIMIC_4_DIR, sorted_file, '%s/notes_labeled_ICD10.csv' % MIMIC_4_DIR)

CONCATENATING
0 done
10000 done
20000 done
30000 done
40000 done
50000 done
60000 done
70000 done
80000 done
90000 done
100000 done
110000 done
120000 done


# Filter the data to only include respiratory ICD-codes

## ICD-9

In [40]:
import re
pattern = re.compile("4[6-9][0-9]|5[0-1][0-9].*")

In [41]:
#Let's filter out ICD9_CODEs
with open('%s/all_codes_filtered_ICD9.csv' % MIMIC_4_DIR, 'r') as lf:
    with open('%s/all_codes_respiratory_ICD9.csv' % MIMIC_4_DIR, 'w') as of:
        w = csv.writer(of)
        w.writerow(['SUBJECT_ID', 'HADM_ID', 'ICD_CODE', 'ADMITTIME', 'DISCHTIME'])
        r = csv.reader(lf)
        #header
        next(r)
        for i,row in enumerate(r):
            icd_code = row[2]
            if pattern.match(icd_code):
                w.writerow(row)

In [42]:
df_res = pd.read_csv('%s/all_codes_respiratory_ICD9.csv' % MIMIC_4_DIR, index_col=None)

In [43]:
len(df_res['HADM_ID'].unique())

67057

In [44]:
#we still need to sort it by HADM_ID
df_res = df_res.sort_values(['SUBJECT_ID', 'HADM_ID'])
df_res.to_csv('%s/all_codes_respiratory_ICD9.csv' % MIMIC_4_DIR, index=False)

In [45]:
labeled_res = concat_and_split.concat_data_respiratory('%s/all_codes_respiratory_ICD9.csv' % MIMIC_4_DIR, sorted_file, '%s/notes_labeled_respiratory_ICD9.csv' % MIMIC_4_DIR)

CONCATENATING
0 done
10000 done
20000 done
30000 done
40000 done
50000 done
60000 done


In [46]:
dfnl_res = pd.read_csv(labeled_res)
len(dfnl_res['HADM_ID'].unique())

67057

In [47]:
len(dfnl_res)

67057

In [48]:
df = pd.read_csv('%s/all_codes_respiratory_ICD9.csv' % MIMIC_4_DIR, dtype={"ICD_CODE": str})
len(df['ICD_CODE'].unique())

223

In [50]:
df['ICD_CODE'].unique()

array(['496.0', '486.0', '511.89', '511.9', '482.9', '518.0', '493.9',
       '493.2', '487.1', '493.22', '491.22', '492.8', '518.89', '518.84',
       '491.21', '518.81', '477.9', '518.4', '473.9', '494.0', '462.0',
       '517.8', '478.19', '518.83', '482.41', '507.0', '482.49', '464.0',
       '511.81', '512.89', '512.1', '465.9', '511.0', '493.92', '518.82',
       '508.0', '515.0', '519.19', '510.9', '514.0', '482.0', '518.51',
       '518.5', '516.8', '482.42', '488.1', '517.2', '478.2', '484.7',
       '480.1', '484.6', '480.9', '482.39', '466.0', '487.0', '518.7',
       '481.0', '501.0', '493.0', '477.0', '490.0', '482.81', '488.02',
       '472.0', '494.1', '478.31', '491.2', '478.33', '478.6', '483.0',
       '518.53', '518.3', '495.9', '513.0', '508.1', '465.8', '519.8',
       '492.0', '519.2', '512.81', '512.8', '482.83', '519.02', '473.3',
       '482.4', '519.3', '493.1', '510.0', '518.52', '474.8', '473.0',
       '512.84', '461.0', '480.2', '516.31', '478.75', '493.02

## ICD 10


In [52]:
#Let's filter out ICD10_CODEs
pattern = re.compile("J.*")
with open('%s/all_codes_filtered_ICD10.csv' % MIMIC_4_DIR, 'r') as lf:
    with open('%s/all_codes_respiratory_ICD10.csv' % MIMIC_4_DIR, 'w') as of:
        w = csv.writer(of)
        w.writerow(['SUBJECT_ID', 'HADM_ID', 'ICD_CODE', 'ADMITTIME', 'DISCHTIME'])
        r = csv.reader(lf)
        #header
        next(r)
        for i,row in enumerate(r):
            icd_code = row[2]
            if pattern.match(icd_code):
                w.writerow(row)

In [53]:
df_res_10 = pd.read_csv('%s/all_codes_respiratory_ICD10.csv' % MIMIC_4_DIR, index_col=None)

In [54]:
len(df_res_10['HADM_ID'].unique())

42958

In [55]:
#we still need to sort it by HADM_ID
df_res_10 = df_res_10.sort_values(['SUBJECT_ID', 'HADM_ID'])
df_res_10.to_csv('%s/all_codes_respiratory_ICD10.csv' % MIMIC_4_DIR, index=False)

In [56]:
labeled_res = concat_and_split.concat_data_respiratory('%s/all_codes_respiratory_ICD10.csv' % MIMIC_4_DIR, sorted_file, '%s/notes_labeled_respiratory_ICD10.csv' % MIMIC_4_DIR)

CONCATENATING
0 done
10000 done
20000 done
30000 done
40000 done


In [57]:
dfnl_res = pd.read_csv(labeled_res)
len(dfnl_res['HADM_ID'].unique())

42958

In [58]:
df = pd.read_csv('%s/all_codes_respiratory_ICD10.csv' % MIMIC_4_DIR, dtype={"ICD_CODE": str})
len(df['ICD_CODE'].unique())

284

In [59]:
df['ICD_CODE'].unique()

array(['J441', 'J45909', 'J9602', 'J9601', 'J0190', 'J449', 'J45998',
       'J9811', 'J439', 'J189', 'J9691', 'J90', 'J4520', 'J9819', 'J910',
       'J8410', 'J690', 'J95811', 'J9600', 'J309', 'J95821', 'J9690',
       'J9621', 'J811', 'J952', 'J95851', 'J111', 'J869', 'J154', 'J9622',
       'J398', 'J159', 'J208', 'J4541', 'J040', 'J479', 'J942', 'J939',
       'J069', 'J9383', 'J9382', 'J982', 'J9809', 'J9610', 'J60',
       'J45901', 'J329', 'J342', 'J84116', 'J810', 'J15211', 'J40',
       'J029', 'J95812', 'J918', 'J3489', 'J9859', 'J9611', 'J984',
       'J302', 'J440', 'J9692', 'J0180', 'J8489', 'J705', 'J988', 'J99',
       'J09X2', 'J9572', 'J156', 'J95830', 'J151', 'J181', 'J9801',
       'J702', 'J920', 'J122', 'J158', 'J390', 'J384', 'J849', 'J61',
       'J852', 'J0110', 'J155', 'J14', 'J383', 'J310', 'J704', 'J101',
       'J948', 'J80', 'J13', 'J00', 'J471', 'J129', 'J392', 'J188',
       'J628', 'J986', 'J432', 'J9589', 'J698', 'J4550', 'J339', 'J324',
       'J9851'

## Create train/dev/test splits

In [26]:
fname = '%s/notes_labeled.csv' % MIMIC_3_DIR
base_name = "%s/disch" % MIMIC_3_DIR #for output
tr, dv, te = concat_and_split.split_data(fname, base_name=base_name)

SPLITTING
0 read
10000 read
20000 read
30000 read
40000 read
50000 read


## Build vocabulary from training data

In [27]:
vocab_min = 3
vname = '%s/vocab.csv' % MIMIC_3_DIR
build_vocab.build_vocab(vocab_min, tr, vname)

reading in data...
removing rare terms
51917 terms qualify out of 140795 total
writing output


## Sort each data split by length for batching

In [28]:
for splt in ['train', 'dev', 'test']:
    filename = '%s/disch_%s_split.csv' % (MIMIC_3_DIR, splt)
    df = pd.read_csv(filename)
    df['length'] = df.apply(lambda row: len(str(row['TEXT']).split()), axis=1)
    df = df.sort_values(['length'])
    df.to_csv('%s/%s_full.csv' % (MIMIC_3_DIR, splt), index=False)

## Pre-train word embeddings

Let's train word embeddings on all words

In [29]:
w2v_file = word_embeddings.word_embeddings('full', '%s/disch_full.csv' % MIMIC_3_DIR, 100, 0, 5)

building word2vec vocab on /nethome/jmullenbach3/replication/cnn-medical-text/mimicdata/mimic3//disch_full.csv...
training...
writing embeddings to /nethome/jmullenbach3/replication/cnn-medical-text/mimicdata/mimic3//processed_full.w2v


## Write pre-trained word embeddings with new vocab

In [30]:
extract_wvs.gensim_to_embeddings('%s/processed_full.w2v' % MIMIC_3_DIR, '%s/vocab.csv' % MIMIC_3_DIR, Y)

100%|██████████| 51917/51917 [02:58<00:00, 290.28it/s]


## Pre-process code descriptions using the vocab

In [31]:
vocab_index_descriptions.vocab_index_descriptions('%s/vocab.csv' % MIMIC_3_DIR,
                                                  '%s/description_vectors.vocab' % MIMIC_3_DIR)

100%|██████████| 22267/22267 [00:00<00:00, 62940.71it/s]


## Filter each split to the top 50 diagnosis/procedure codes

In [32]:
Y = 50

In [33]:
#first calculate the top k
counts = Counter()
dfnl = pd.read_csv('%s/notes_labeled.csv' % MIMIC_3_DIR)
for row in dfnl.itertuples():
    for label in str(row[4]).split(';'):
        counts[label] += 1

In [34]:
codes_50 = sorted(counts.items(), key=operator.itemgetter(1), reverse=True)

In [35]:
codes_50 = [code[0] for code in codes_50[:Y]]

In [36]:
codes_50

['401.9',
 '38.93',
 '428.0',
 '427.31',
 '414.01',
 '96.04',
 '96.6',
 '584.9',
 '250.00',
 '96.71',
 '272.4',
 '518.81',
 '99.04',
 '39.61',
 '599.0',
 '530.81',
 '96.72',
 '272.0',
 '285.9',
 '88.56',
 '244.9',
 '486',
 '38.91',
 '285.1',
 '36.15',
 '276.2',
 '496',
 '99.15',
 '995.92',
 'V58.61',
 '507.0',
 '038.9',
 '88.72',
 '585.9',
 '403.90',
 '311',
 '305.1',
 '37.22',
 '412',
 '33.24',
 '39.95',
 '287.5',
 '410.71',
 '276.1',
 'V45.81',
 '424.0',
 '45.13',
 'V15.82',
 '511.9',
 '37.23']

In [37]:
with open('%s/TOP_%s_CODES.csv' % (MIMIC_3_DIR, str(Y)), 'w') as of:
    w = csv.writer(of)
    for code in codes_50:
        w.writerow([code])

In [38]:
for splt in ['train', 'dev', 'test']:
    print(splt)
    hadm_ids = set()
    with open('%s/%s_50_hadm_ids.csv' % (MIMIC_3_DIR, splt), 'r') as f:
        for line in f:
            hadm_ids.add(line.rstrip())
    with open('%s/notes_labeled.csv' % MIMIC_3_DIR, 'r') as f:
        with open('%s/%s_%s.csv' % (MIMIC_3_DIR, splt, str(Y)), 'w') as of:
            r = csv.reader(f)
            w = csv.writer(of)
            #header
            w.writerow(next(r))
            i = 0
            for row in r:
                hadm_id = row[1]
                if hadm_id not in hadm_ids:
                    continue
                codes = set(str(row[3]).split(';'))
                filtered_codes = codes.intersection(set(codes_50))
                if len(filtered_codes) > 0:
                    w.writerow(row[:3] + [';'.join(filtered_codes)])
                    i += 1

train
dev
test


In [39]:
for splt in ['train', 'dev', 'test']:
    filename = '%s/%s_%s.csv' % (MIMIC_3_DIR, splt, str(Y))
    df = pd.read_csv(filename)
    df['length'] = df.apply(lambda row: len(str(row['TEXT']).split()), axis=1)
    df = df.sort_values(['length'])
    df.to_csv('%s/%s_%s.csv' % (MIMIC_3_DIR, splt, str(Y)), index=False)