In [1]:
import numpy as np
import pandas as pd

In [2]:
from typing import List, Tuple, Dict, Set, Union
import torch
import torch.nn as nn
import torch.nn.utils
import torch.nn.functional as F
from model_embeddings import ModelEmbeddings
import vocab
from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence
import ast

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/austinmurphy/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Read in data:

In [3]:
dat = pd.read_csv('data/tiny_data.csv')
print(dat.shape)
dat.head()

(100, 9)


Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,TEXT,ICD9_codes,high_levels,text_tok,text_tok_in,text_ready
0,174,22532,167853,serviced addendumd radiologic studiesd radiolo...,"['4254', '2762', '5119', '2639', '42731', '507...","[0, 1, 2, 6, 7]","['serviced', 'addendumd', 'radiologic', 'studi...","['serviced', 'added', 'radiologic', 'studiesd'...",serviced added radiologic studiesd radiologic ...
1,170,22532,167853,history of present illnessd the patient is an ...,"['4254', '2762', '5119', '2639', '42731', '507...","[0, 1, 2, 6, 7]","['history', 'of', 'present', 'illnessd', 'the'...","['history', 'of', 'present', 'illnessd', 'the'...",history of present illnessd the patient is an ...
2,175,13702,107527,sexd f serviced micu and then to medicine hist...,"['486', '2761', '2449', '49121', '311', '51881']","[2, 4, 7]","['sexd', 'f', 'serviced', 'micu', 'and', 'then...","['sexd', 'f', 'serviced', 'micu', 'and', 'then...",sexd f serviced micu and then to medicine hist...
3,176,13702,167118,serviced cardiothoracic allergiesd amlodipine ...,"['2762', '496', '5533', '45340']","[2, 6, 7, 8]","['serviced', 'cardiothoracic', 'allergiesd', '...","['serviced', 'cardiothoracic', 'allergiesd', '...",serviced cardiothoracic allergiesd amlodipine ...
4,177,13702,196489,serviced medicine allergiesd amlodipine attend...,"['2762', '45829', '41401', '4019', 'E9320', '5...","[2, 5, 6, 7, 8, 9, 17, 18]","['serviced', 'medicine', 'allergiesd', 'amlodi...","['serviced', 'medicine', 'allergiesd', 'amlodi...",serviced medicine allergiesd amlodipine attend...


In [4]:
import nltk
text = []
data = dat[['text_ready', 'high_levels']]

for row in data['text_ready']:
    text.append(nltk.word_tokenize(row))

In [5]:
i = 0
for doc in text:
    print(len(doc))
    if i > 20:
        break
    i += 1

84
2675
2138
1523
2786
2255
1255
847
1634
862
1019
2267
2540
396
396
880
3265
1742
2765
2273
818
1022


In [6]:
ready_text = []
for row in text:
    ready_text.append(row[:500])
    

In [7]:
ready_text

[['serviced',
  'added',
  'radiologic',
  'studiesd',
  'radiologic',
  'studies',
  'also',
  'included',
  'a',
  'chest',
  'ct',
  ',',
  'which',
  'confirmed',
  'cavity',
  'lesions',
  'in',
  'the',
  'left',
  'lung',
  'apex',
  'consistent',
  'with',
  'infectious',
  'tuberculosis',
  '.',
  'this',
  'also',
  'moderately',
  'left',
  'pleural',
  'effusion',
  '.',
  'head',
  'ctd',
  'head',
  'ct',
  'showed',
  'no',
  'intracranial',
  'hemorrhage',
  'or',
  'mass',
  'effect',
  ',',
  'but',
  'old',
  'infarction',
  'consistent',
  'with',
  'past',
  'medical',
  'history',
  '.',
  'abdominal',
  'ctd',
  'abdominal',
  'ct',
  'showed',
  'lesions',
  'of',
  'dd',
  'and',
  'sacrum',
  'most',
  'likely',
  'secondary',
  'to',
  'osteoporosis',
  '.',
  'these',
  'can',
  'be',
  'followed',
  'by',
  'repeat',
  'imaging',
  'as',
  'an',
  'outpatient',
  '.',
  ',',
  'm.d',
  '.'],
 ['history',
  'of',
  'present',
  'illnessd',
  'the',
  'patien

In [8]:
# need to pad the sentences

In [9]:
data.head()

Unnamed: 0,text_ready,high_levels
0,serviced added radiologic studiesd radiologic ...,"[0, 1, 2, 6, 7]"
1,history of present illnessd the patient is an ...,"[0, 1, 2, 6, 7]"
2,sexd f serviced micu and then to medicine hist...,"[2, 4, 7]"
3,serviced cardiothoracic allergiesd amlodipine ...,"[2, 6, 7, 8]"
4,serviced medicine allergiesd amlodipine attend...,"[2, 5, 6, 7, 8, 9, 17, 18]"


In [10]:
vocab_ = [item for sublist in ready_text for item in sublist]

In [11]:
vocab_

['serviced',
 'added',
 'radiologic',
 'studiesd',
 'radiologic',
 'studies',
 'also',
 'included',
 'a',
 'chest',
 'ct',
 ',',
 'which',
 'confirmed',
 'cavity',
 'lesions',
 'in',
 'the',
 'left',
 'lung',
 'apex',
 'consistent',
 'with',
 'infectious',
 'tuberculosis',
 '.',
 'this',
 'also',
 'moderately',
 'left',
 'pleural',
 'effusion',
 '.',
 'head',
 'ctd',
 'head',
 'ct',
 'showed',
 'no',
 'intracranial',
 'hemorrhage',
 'or',
 'mass',
 'effect',
 ',',
 'but',
 'old',
 'infarction',
 'consistent',
 'with',
 'past',
 'medical',
 'history',
 '.',
 'abdominal',
 'ctd',
 'abdominal',
 'ct',
 'showed',
 'lesions',
 'of',
 'dd',
 'and',
 'sacrum',
 'most',
 'likely',
 'secondary',
 'to',
 'osteoporosis',
 '.',
 'these',
 'can',
 'be',
 'followed',
 'by',
 'repeat',
 'imaging',
 'as',
 'an',
 'outpatient',
 '.',
 ',',
 'm.d',
 '.',
 'history',
 'of',
 'present',
 'illnessd',
 'the',
 'patient',
 'is',
 'an',
 'dd',
 'year',
 'old',
 'atrial',
 'medical',
 'female',
 'who',
 'on',


In [12]:
uniq_vocab = np.unique(vocab_)

In [13]:
len(uniq_vocab)

3964

In [14]:
word2ind = {}
for i, word in enumerate(uniq_vocab):
    word2ind[word] = i

In [15]:
vocab.VocabEntry(word2ind)

KeyError: '<unk>'

In [61]:
word2ind

{',': 0,
 '-': 1,
 '--': 2,
 '-d': 3,
 '-d.ddd': 4,
 '-no': 5,
 '.': 6,
 '.d': 7,
 '/': 8,
 'a': 9,
 'a-fib': 10,
 'a-line': 11,
 'a.m.': 12,
 'a/pd': 13,
 'aa': 14,
 'aadod-ddd': 15,
 'abd': 16,
 'abdd': 17,
 'abdomen': 18,
 'abdomend': 19,
 'abdominal': 20,
 'abg': 21,
 'ability': 22,
 'able': 23,
 'abnormal': 24,
 'abnormalities': 25,
 'abnormality': 26,
 'aborted': 27,
 'about': 28,
 'above': 29,
 'abscess': 30,
 'absence': 31,
 'absent': 32,
 'abstinent': 33,
 'abuse': 34,
 'abused': 35,
 'abusing': 36,
 'ac': 37,
 'accepted': 38,
 'access': 39,
 'accessory': 40,
 'accident': 41,
 'accidentally': 42,
 'accommodation': 43,
 'accompanying': 44,
 'accomplished': 45,
 'according': 46,
 'accumulation': 47,
 'ace': 48,
 'acetaminophen': 49,
 'acid': 50,
 'acidosis': 51,
 'across': 52,
 'acs': 53,
 'activated': 54,
 'active': 55,
 'actively': 56,
 'activity': 57,
 'acute': 58,
 'acutely': 59,
 'acyclovir': 60,
 'added': 61,
 'addition': 62,
 'additional': 63,
 'additionally': 64,
 'adequ

## Create model

In [16]:
class MIMIC(nn.Module):
    """
    Simple Neural Multilabel Classification Model:
    - Bidirectional LSTM Encoder
    """
    def __init__(self, embed_size, hidden_size, vocab, dropout_rate):
        super(MIMIC, self).__init__()
        self.model_embeddings = ModelEmbeddings(embed_size, vocab)
        self.hidden_size = hidden_size
        self.dropout_rate = dropout_rate
        self.vocab = vocab
        
        self.encoder = None
        self.h_projection = None
        self.c_projection = None
        self.encoder = nn.LSTM(input_size=embed_size,
                               hidden_size=hidden_size, 
                               bias=True, 
                               bidirectional=True)
        
    def forward(self, in_sents: List[List[str]]):
        # Compute sentence lengths
        source_lengths = [len(s) for s in in_sents]
        
        # Convert list of lists into tensors
        source_padded = self.vocab.notes_.to_input_tensor(source, device=self.device)   # Tensor: (src_len, b)
        
        print(5)

In [18]:
mim = MIMIC(embed_size=len(uniq_vocab), hidden_size=100, vocab=word2ind, dropout_rate=0.2)

In [20]:
mim.forward(ready_text[:2])

5


In [32]:
labs = [[0,1,5],[0,1]]
labs_ = torch.zeros(6,2)
labs_.where([labs]) = 1
labs_

SyntaxError: can't assign to function call (<ipython-input-32-9e05bf406f6f>, line 3)

In [34]:
labs_ = torch.zeros(2,6)
print(labs_)

tensor([[0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.]])


In [35]:
for i in range(labs_.shape[0]):
    labs_[i,labs[i]] = 1

In [36]:
labs_

tensor([[1., 1., 0., 0., 0., 1.],
        [1., 1., 0., 0., 0., 0.]])

In [5]:
class NMT(nn.Module):
    """ Simple Neural Machine Translation Model:
        - Bidrectional LSTM Encoder
        - Unidirection LSTM Decoder
        - Global Attention Model (Luong, et al. 2015)
    """
    def __init__(self, embed_size, hidden_size, vocab, dropout_rate=0.2):
        super(NMT, self).__init__()
        self.model_embeddings = ModelEmbeddings(embed_size, vocab)
        self.hidden_size = hidden_size
        self.dropout_rate = dropout_rate
        self.vocab = vocab

        # default values
        self.encoder = None 
        self.decoder = None
        self.h_projection = None
        self.c_projection = None
        self.att_projection = None
        self.combined_output_projection = None
        self.target_vocab_projection = None
        self.dropout = None
        # For sanity check only, not relevant to implementation
        self.gen_sanity_check = False
        self.counter = 0

        self.encoder = nn.LSTM(input_size=embed_size,
                                       hidden_size=hidden_size, 
                                       bias=True, 
                                       bidirectional=True)
        
        
    def forward(self, source: List[List[str]], target: List[List[str]]) -> torch.Tensor:
        """ Take a mini-batch of source and target sentences, compute the log-likelihood of
        target sentences under the language models learned by the NMT system.

        @param source (List[List[str]]): list of source sentence tokens
        @param target (List[List[str]]): list of target sentence tokens, wrapped by `<s>` and `</s>`

        @returns scores (Tensor): a variable/tensor of shape (b, ) representing the
                                    log-likelihood of generating the gold-standard target sentence for
                                    each example in the input batch. Here b = batch size.
        """
        # Compute sentence lengths
        source_lengths = [len(s) for s in source]

        # Convert list of lists into tensors
        source_padded = self.vocab.src.to_input_tensor(source, device=self.device)   # Tensor: (src_len, b)
        target_padded = self.vocab.tgt.to_input_tensor(target, device=self.device)   # Tensor: (tgt_len, b)

        ###     Run the network forward:
        ###     1. Apply the encoder to `source_padded` by calling `self.encode()`
        ###     2. Generate sentence masks for `source_padded` by calling `self.generate_sent_masks()`
        ###     3. Apply the decoder to compute combined-output by calling `self.decode()`
        ###     4. Compute log probability distribution over the target vocabulary using the
        ###        combined_outputs returned by the `self.decode()` function.

        enc_hiddens, dec_init_state = self.encode(source_padded, source_lengths)
        enc_masks = self.generate_sent_masks(enc_hiddens, source_lengths)
        combined_outputs = self.decode(enc_hiddens, enc_masks, dec_init_state, target_padded)
        P = F.log_softmax(self.target_vocab_projection(combined_outputs), dim=-1)

        # Zero out, probabilities for which we have nothing in the target text
        target_masks = (target_padded != self.vocab.tgt['<pad>']).float()
        
        # Compute log probability of generating true target words
        target_gold_words_log_prob = torch.gather(P, index=target_padded[1:].unsqueeze(-1), dim=-1).squeeze(-1) * target_masks[1:]
        scores = target_gold_words_log_prob.sum(dim=0)
        return scores
    
    
    def encode(self, source_padded: torch.Tensor, source_lengths: List[int]) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
        """ Apply the encoder to source sentences to obtain encoder hidden states.
            Additionally, take the final states of the encoder and project them to obtain initial states for decoder.

        @param source_padded (Tensor): Tensor of padded source sentences with shape (src_len, b), where
                                        b = batch_size, src_len = maximum source sentence length. Note that 
                                       these have already been sorted in order of longest to shortest sentence.
        @param source_lengths (List[int]): List of actual lengths for each of the source sentences in the batch
        @returns enc_hiddens (Tensor): Tensor of hidden units with shape (b, src_len, h*2), where
                                        b = batch size, src_len = maximum source sentence length, h = hidden size.
        @returns dec_init_state (tuple(Tensor, Tensor)): Tuple of tensors representing the decoder's initial
                                                hidden state and cell.
        """
        enc_hiddens, dec_init_state = None, None

        ### YOUR CODE HERE (~ 8 Lines)
        ### TODO:
        ###     1. Construct Tensor `X` of source sentences with shape (src_len, b, e) using the source model embeddings.
        ###         src_len = maximum source sentence length, b = batch size, e = embedding size. Note
        ###         that there is no initial hidden state or cell for the decoder.
        ###     2. Compute `enc_hiddens`, `last_hidden`, `last_cell` by applying the encoder to `X`.
        ###         - Before you can apply the encoder, you need to apply the `pack_padded_sequence` function to X.
        ###         - After you apply the encoder, you need to apply the `pad_packed_sequence` function to enc_hiddens.
        ###         - Note that the shape of the tensor returned by the encoder is (src_len, b, h*2) and we want to
        ###           return a tensor of shape (b, src_len, h*2) as `enc_hiddens`.
        ###     3. Compute `dec_init_state` = (init_decoder_hidden, init_decoder_cell):
        ###         - `init_decoder_hidden`:
        ###             `last_hidden` is a tensor shape (2, b, h). The first dimension corresponds to forwards and backwards.
        ###             Concatenate the forwards and backwards tensors to obtain a tensor shape (b, 2*h).
        ###             Apply the h_projection layer to this in order to compute init_decoder_hidden.
        ###             This is h_0^{dec} in the PDF. Here b = batch size, h = hidden size
        ###         - `init_decoder_cell`:
        ###             `last_cell` is a tensor shape (2, b, h). The first dimension corresponds to forwards and backwards.
        ###             Concatenate the forwards and backwards tensors to obtain a tensor shape (b, 2*h).
        ###             Apply the c_projection layer to this in order to compute init_decoder_cell.
        ###             This is c_0^{dec} in the PDF. Here b = batch size, h = hidden size
        ###
        ### See the following docs, as you may need to use some of the following functions in your implementation:
        ###     Pack the padded sequence X before passing to the encoder:
        ###         https://pytorch.org/docs/stable/nn.html#torch.nn.utils.rnn.pack_padded_sequence
        ###     Pad the packed sequence, enc_hiddens, returned by the encoder:
        ###         https://pytorch.org/docs/stable/nn.html#torch.nn.utils.rnn.pad_packed_sequence
        ###     Tensor Concatenation:
        ###         https://pytorch.org/docs/stable/torch.html#torch.cat
        ###     Tensor Permute:
        ###         https://pytorch.org/docs/stable/tensors.html#torch.Tensor.permute

        X = self.model_embeddings.source(source_padded)
        X = pack_padded_sequence(input=X, lengths=source_lengths)
        
        enc_hiddens, (last_hidden, last_cell) = self.encoder(X)
        (enc_hiddens,list_lengths) = pad_packed_sequence(enc_hiddens)
        
        temp = torch.cat((last_hidden[0,:,:],last_hidden[1,:,:]), dim=1)
        init_decoder_hidden = self.h_projection(temp)
        
        temp = torch.cat((last_cell[0,:,:], last_cell[1,:,:]), dim=1)
        init_decoder_cell = self.c_projection(temp)
        dec_init_state = (init_decoder_hidden, init_decoder_cell)

        ### END YOUR CODE

        return enc_hiddens.permute([1,0,2]), dec_init_state

In [37]:
aset = set([1,2,3])

In [38]:
aset.update([2,3,4])

In [39]:
aset

{1, 2, 3, 4}

In [40]:
len(aset)

4

In [41]:
import ast

In [43]:
ast.literal_eval("[1,2,3]")

[1, 2, 3]