In [3]:
import sys
sys.path.append('../')
import datasets
from learn import models
import log_reg
from dataproc import extract_wvs
from dataproc import get_discharge_summaries
from dataproc import concat_and_split
from dataproc import build_vocab
from dataproc import vocab_index_descriptions
from dataproc import word_embeddings
from constants import MIMIC_3_DIR, DATA_DIR

import numpy as np
import pandas as pd

from collections import Counter, defaultdict
import csv
import math
import operator

Let's do some data processing in a much better way, with a notebook.

First, let's define some stuff.

In [2]:
#Label space: 'full' means every label available in the dataset is used for prediction
Y = 'full'
#Vocabulary: 'full' puts no hard cap on its size; tokens appearing in fewer than
#vocab_min documents are discarded
vocab_size = 'full'
vocab_min = 3
#Raw note events downloaded from MIMIC-III
notes_file = '%s/NOTEEVENTS.csv' % MIMIC_3_DIR
#Proportions of the train/dev/test split
split = [.9, .1/3, .2/3]

# Data processing

## Combine diagnosis and procedure codes and reformat them

The codes in MIMIC-III are given in separate files for procedures and diagnoses, and the codes are given without periods, which might lead to collisions if we naively combine them. So we have to add the periods back in the right place.

## How many codes are there?

## Tokenize and preprocess raw text

Preprocessing time!

This will:
- Select only discharge summaries and their addenda
- Remove punctuation and numeric-only tokens (e.g. `500` is dropped, but `250mg` is kept)
- Lowercase all tokens

In [3]:
#Read every note, keep only the discharge summaries, tokenize them and write them out;
#the call hands back the name of the file it wrote
disch_full_path = "%s/disch_full.csv" % MIMIC_3_DIR
disch_full_file = get_discharge_summaries.write_discharge_summaries(out_file=disch_full_path)

118it [00:00, 1177.93it/s]

processing notes file
writing to ../mimicdata/mimic3/disch_full.csv


2083180it [01:20, 26021.31it/s]


Let's read this in and see what kind of data we're working with

In [3]:
#Load the tokenized discharge summaries from disk
disch_csv = '%s/disch_full.csv' % MIMIC_3_DIR
df = pd.read_csv(disch_csv)

In [16]:
#Peek at the first rows and the total number of notes
print(df.head())
print(len(df.index))
#Print the text of every note stored under one admission so it can be read by hand.
#NOTE(review): 169684 looks hand-picked (presumably an admission with more than one
#note) - confirm. A bare mid-cell expression is never displayed, so the duplicate
#look-up is not kept here; wrap it in print()/display() if it is wanted.
new_df = df[df['HADM_ID'] == 169684]
for text in new_df['TEXT']:
    print(text)

   SUBJECT_ID  HADM_ID  CHARTTIME  \
0       22532   167853        NaN   
1       13702   107527        NaN   
2       13702   167118        NaN   
3       13702   196489        NaN   
4       26880   135453        NaN   

                                                TEXT  
0  admission date discharge date service addendum...  
1  admission date discharge date date of birth se...  
2  admission date discharge date service cardioth...  
3  admission date discharge date service medicine...  
4  admission date discharge date date of birth se...  
59652
admission date discharge date date of birth sex m service cardiac surgery chief complaint chest pain vessel disease on catheterization history of present illness the patient is a year old male transferred from hospital6 to the hospital1 status post catheterization revealing vessel cardiac disease the patient presented to hospital6 with gradually increasing chest pain over the past three to four months to the point that he had chest pain 

In [20]:
#How many distinct admissions? (dropna=False so a missing HADM_ID would still count
#as one value, exactly as len(unique()) does)
df['HADM_ID'].nunique(dropna=False)

52726

In [18]:
#Tokens and types: collect the set of distinct whitespace-separated tokens (types)
#and the running total of tokens over every note
types = set()
num_tok = 0
for text in df['TEXT']:
    tokens = text.split()
    types.update(tokens)
    num_tok += len(tokens)

In [21]:
#Report the vocabulary size and the corpus length
print("Num types", len(types))
print("Num tokens", num_tok)

Num types 150854
Num tokens 79801387


In [22]:
#Order the notes by patient, then by admission, to make a correspondence with the
#MIMIC-3 label file
df = df.sort_values(by=['SUBJECT_ID', 'HADM_ID'])

## Consolidate labels with set of discharge summaries

It looks like some HADM_IDs did not have discharge summaries, so they were not included in our notes.

## Append labels to notes in a single file

In [23]:
#Next step is to append to each instance all of its codes. That is non-trivial and is
#handled by a separate script, which needs the notes on disk - so write them out first.
#Note the target is the same disch_full.csv the notes were loaded from.
sorted_file = '%s/disch_full.csv' % MIMIC_3_DIR
df.to_csv(path_or_buf=sorted_file, index=False)

## Create train/dev/test splits

In [24]:
#Show the first five rows of the notes frame
print(df.head(n=5))

       SUBJECT_ID  HADM_ID  CHARTTIME  \
48470           3   145834        NaN   
4782            4   185777        NaN   
24476           6   107064        NaN   
22764           9   150750        NaN   
57328           9   150750        NaN   

                                                    TEXT  
48470  admission date discharge date date of birth se...  
4782   admission date discharge date date of birth se...  
24476  admission date discharge date date of birth se...  
22764  admission date discharge date date of birth se...  
57328  name known lastname known firstname unit no nu...  


In [46]:
#print(len(df))
#print(df[~(df.duplicated(subset=['SUBJECT_ID','HADM_ID', 'TEXT'])) * df.duplicated(subset=['SUBJECT_ID','HADM_ID'])])
#len(df[~df.duplicated(subset=['SUBJECT_ID','HADM_ID']) - df.duplicated(subset=['SUBJECT_ID','HADM_ID', 'TEXT'])])

## Build vocabulary from training data

In [53]:
import importlib
#re-import build_vocab so edits made to the module since the first import are picked up
importlib.reload(build_vocab)
#minimum-count threshold, notes file to read, and vocab file name handed to build_vocab
#NOTE(review): the section title says "training data" but infile is disch_full.csv - confirm
vocab_min = 3
infile = '%s/disch_full.csv' % MIMIC_3_DIR
vname = '%s/vocab.csv' % MIMIC_3_DIR
build_vocab.build_vocab(vocab_min, infile, vname)

reading in data...
removing rare terms
55418 terms qualify out of 150854 total
writing output


## Sort each data split by length for batching

In [None]:
#For every split: add a token-count column, sort by it, and write the result to
#<split>_full.csv
for splt in ['train', 'dev', 'test']:
    filename = '%s/disch_%s_split.csv' % (MIMIC_3_DIR, splt)
    df = pd.read_csv(filename)
    #whitespace token count per note; str() keeps a missing TEXT (NaN) from raising.
    #Mapping over the TEXT column gives the same values as the row-wise
    #DataFrame.apply(axis=1) without building a Series object for every row
    df['length'] = df['TEXT'].map(lambda text: len(str(text).split()))
    df = df.sort_values(['length'])
    df.to_csv('%s/%s_full.csv' % (MIMIC_3_DIR, splt), index=False)

## Write BOW files for input to logistic regression

In [None]:
#Load the lookups returned by datasets.load_lookups for the full label space,
#from the training file and the vocab file (the last two return values are unused)
Y = 'full'
train_file = '%s/train_%s.csv' % (MIMIC_3_DIR, str(Y))
vocab_file = '%s/vocab.csv' % MIMIC_3_DIR
ind2w, w2ind, ind2c, c2ind, _, _ = datasets.load_lookups(train_file, vocab_file, Y=Y)

In [None]:
#Run log_reg.construct_X_Y on the training file with the word and code lookups
train_path = '%s/train_%s.csv' % (MIMIC_3_DIR, str(Y))
X, yy, hadm_ids = log_reg.construct_X_Y(train_path, Y, w2ind, c2ind, 'mimic3')

In [None]:
#Hand the training X / ids / labels to log_reg.write_bows
train_path = '%s/train_%s.csv' % (MIMIC_3_DIR, str(Y))
log_reg.write_bows(train_path, X, hadm_ids, yy, ind2c)

In [None]:
#Same call as for train, now on the test file (X, yy and hadm_ids are overwritten)
test_path = '%s/test_%s.csv' % (MIMIC_3_DIR, str(Y))
X, yy, hadm_ids = log_reg.construct_X_Y(test_path, Y, w2ind, c2ind, 'mimic3')

In [None]:
#Hand the test X / ids / labels to log_reg.write_bows
test_path = '%s/test_%s.csv' % (MIMIC_3_DIR, str(Y))
log_reg.write_bows(test_path, X, hadm_ids, yy, ind2c)

## Pre-train word embeddings

Let's train word embeddings on all words

In [5]:
import importlib
#re-import word_embeddings so edits made to the module are picked up
importlib.reload(word_embeddings)
#Train on the full discharge-summary file and keep the returned value in w2v_file.
#NOTE(review): the positional 100, 0, 5 are passed straight through; their meaning is
#not visible in this notebook - confirm against dataproc/word_embeddings.py
notes_path = '%s/disch_full.csv' % MIMIC_3_DIR
w2v_file = word_embeddings.word_embeddings('full', notes_path, 100, 0, 5)

building word2vec vocab on ../mimicdata/mimic3/disch_full.csv...
training...
writing embeddings to ../mimicdata/mimic3/processed_full.w2v


## Write pre-trained word embeddings with new vocab

In [10]:
#importlib is imported here (it was commented out) so this cell does not depend on an
#earlier cell having been run first
import importlib
#re-import both modules so edits made to them are picked up
importlib.reload(extract_wvs)
importlib.reload(datasets)
Y = 'full'
#Call extract_wvs.gensim_to_embeddings with the trained word2vec file and the vocab file
extract_wvs.gensim_to_embeddings('%s/processed_full.w2v' % MIMIC_3_DIR, '%s/vocab.csv' % MIMIC_3_DIR, Y)

100%|██████████| 55418/55418 [02:02<00:00, 453.64it/s] 


## Pre-process code descriptions using the vocab

In [None]:
#Python 3 has no builtin reload(); use importlib.reload as the other cells do
import importlib
importlib.reload(vocab_index_descriptions)
importlib.reload(datasets)
#Call vocab_index_descriptions with the vocab file and the description-vectors file
vocab_index_descriptions.vocab_index_descriptions('%s/vocab.csv' % MIMIC_3_DIR,
                                                  '%s/description_vectors.vocab' % MIMIC_3_DIR)

## Filter each split to the top 50 diagnosis/procedure codes

In [None]:
#From here on, work with the Y most frequent codes instead of the full label set
Y = 50

In [None]:
#first calculate the top k: tally every occurrence of each code in the labeled notes
dfnl = pd.read_csv('%s/notes_labeled.csv' % MIMIC_3_DIR)
counts = Counter()
for row in dfnl.itertuples():
    #row[0] is the frame index, so row[4] is the fourth CSV column, split on ';'
    #NOTE(review): a row with a missing value there is tallied under the string 'nan'
    counts.update(str(row[4]).split(';'))

In [None]:
#Rank the codes by count (most frequent first; ties keep their order in counts)
#and keep the Y most frequent.
#Done in a single statement: slicing codes_50 out of itself in a second cell meant that
#re-running that cell reduced every code to its first character.
codes_50 = [code for code, _ in sorted(counts.items(), key=operator.itemgetter(1), reverse=True)[:Y]]

In [None]:
#Display the selected codes for a visual check
codes_50

In [None]:
#Save the selected codes, one per row.
#newline='' is what the csv module documents for files handed to csv.writer; without it
#platforms that translate '\n' end up with an extra '\r' on every row
with open('%s/TOP_%s_CODES.csv' % (MIMIC_3_DIR, str(Y)), 'w', newline='') as of:
    w = csv.writer(of)
    w.writerows([code] for code in codes_50)

In [None]:
#For each split, copy the labeled notes whose HADM_ID is listed for that split,
#keeping only the top codes and skipping notes left with no code at all
top_codes = set(codes_50)  #built once rather than once per row
for splt in ['train', 'dev', 'test']:
    print(splt)
    #admission ids belonging to this split, one per line
    #NOTE(review): this file name hard-codes 50 while the output name uses Y
    hadm_ids = set()
    with open('%s/%s_50_hadm_ids.csv' % (MIMIC_3_DIR, splt), 'r') as f:
        for line in f:
            hadm_ids.add(line.rstrip())
    #newline='' on both files, as the csv module documents for reader and writer
    with open('%s/notes_labeled.csv' % MIMIC_3_DIR, 'r', newline='') as f:
        with open('%s/%s_%s.csv' % (MIMIC_3_DIR, splt, str(Y)), 'w', newline='') as of:
            r = csv.reader(f)
            w = csv.writer(of)
            #header
            w.writerow(next(r))
            for row in r:
                hadm_id = row[1]
                if hadm_id not in hadm_ids:
                    continue
                codes = set(str(row[3]).split(';'))
                #sorted so the written label order is identical on every run
                #(iteration order of a set of strings is not guaranteed across runs)
                filtered_codes = sorted(codes.intersection(top_codes))
                if len(filtered_codes) > 0:
                    w.writerow(row[:3] + [';'.join(filtered_codes)])

In [None]:
#For every split: add a token-count column, sort by it, and write the frame back to the
#same file it was read from
for splt in ['train', 'dev', 'test']:
    filename = '%s/%s_%s.csv' % (MIMIC_3_DIR, splt, str(Y))
    df = pd.read_csv(filename)
    #whitespace token count per note; str() keeps a missing TEXT (NaN) from raising.
    #Mapping over the TEXT column gives the same values as the row-wise
    #DataFrame.apply(axis=1) without building a Series object for every row
    df['length'] = df['TEXT'].map(lambda text: len(str(text).split()))
    df = df.sort_values(['length'])
    df.to_csv('%s/%s_%s.csv' % (MIMIC_3_DIR, splt, str(Y)), index=False)

## Write BOW files for input to logistic regression

In [None]:
#Load the lookups returned by datasets.load_lookups for the top-50 label space,
#from the training file and the vocab file (the last two return values are unused)
Y = 50
train_file = '%s/train_%s.csv' % (MIMIC_3_DIR, str(Y))
vocab_file = '%s/vocab.csv' % MIMIC_3_DIR
ind2w, w2ind, ind2c, c2ind, _, _ = datasets.load_lookups(train_file, vocab_file, Y=Y)

In [None]:
#Run log_reg.construct_X_Y on the training file with the word and code lookups
train_path = '%s/train_%s.csv' % (MIMIC_3_DIR, str(Y))
X, yy, hadm_ids = log_reg.construct_X_Y(train_path, Y, w2ind, c2ind, 'mimic3')

In [None]:
#Hand the training X / ids / labels to log_reg.write_bows
train_path = '%s/train_%s.csv' % (MIMIC_3_DIR, str(Y))
log_reg.write_bows(train_path, X, hadm_ids, yy, ind2c)

In [None]:
#Same call as for train, now on the test file (X, yy and hadm_ids are overwritten)
test_path = '%s/test_%s.csv' % (MIMIC_3_DIR, str(Y))
X, yy, hadm_ids = log_reg.construct_X_Y(test_path, Y, w2ind, c2ind, 'mimic3')

In [None]:
#Hand the test X / ids / labels to log_reg.write_bows
test_path = '%s/test_%s.csv' % (MIMIC_3_DIR, str(Y))
log_reg.write_bows(test_path, X, hadm_ids, yy, ind2c)