In [51]:
from __future__ import print_function
import jsonlines
import re
import json
import nltk
import io
import numpy as np
from keras.layers import Embedding
from keras.initializers import Constant
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [52]:
# start - <sos> tag
# stop - <eos> tag

#split records into list of impression, findings
def splitReports(File):
    """Read the reports file and split it into findings and impression texts.

    Every line of the file is scanned for flat JSON objects (``{...}``), so the
    function handles both one-record-per-line files and files that pack several
    records on a single line. Previously only the last line of the file was
    parsed and an empty file raised NameError.

    Limitation: records are located with a non-greedy ``{.*?}`` match, so a
    record containing a nested object or a literal ``}`` inside a string is
    cut short and will fail to parse.

    Args:
        File: path of the reports file. Each record must contain the keys
            'Findings' and 'Impression'.

    Returns:
        Tuple ``(findings_list, impression_list)`` of parallel lists of
        strings. Each impression is wrapped as ``'start <text> stop'`` so the
        decoder has explicit begin / end markers.

    Raises:
        ValueError: if a matched record is not valid JSON (from json.loads).
        KeyError: if a record lacks 'Findings' or 'Impression'.
    """
    record_pattern = re.compile(r'{.*?}')

    impression_list = []
    findings_list = []

    with open(File, 'r') as handle:
        for line in handle:
            # '.' never matches a newline, so a record cannot span lines;
            # scanning line by line therefore finds every record in the file.
            for report in record_pattern.findall(line):
                obj = json.loads(report)
                impression_list.append('start ' + str(obj['Impression']) + ' stop')
                findings_list.append(str(obj['Findings']))

    return (findings_list, impression_list)

In [53]:
#get the lower-cased vocabularies of the findings and impressions, plus their union
def getVocabularySets(findings_list, impression_list):
    """Build lower-cased vocabularies for the findings and impression texts.

    Each text is tokenized with ``nltk.word_tokenize`` and every token is
    lower-cased before being added to its set.

    Args:
        findings_list: iterable of findings strings.
        impression_list: iterable of impression strings.

    Returns:
        Tuple ``(common_vocab_set, findings_vocab_set, impression_vocab_set)``
        where the common set is the union of the other two.
    """
    def _vocab(texts):
        # Distinct lower-cased tokens over all the given texts.
        return set(
            token.lower()
            for text in texts
            for token in nltk.word_tokenize(text)
        )

    impression_vocab_set = _vocab(impression_list)
    findings_vocab_set = _vocab(findings_list)
    common_vocab_set = impression_vocab_set | findings_vocab_set

    return (common_vocab_set, findings_vocab_set, impression_vocab_set)

In [54]:
# load the pre-trained GloVe vectors into a word -> embedding-vector dictionary
def loadGloVeModel(gloVe_file):
    """Load a GloVe-format text file into a word -> vector dictionary.

    Each non-blank line is expected to be ``<word> <float> <float> ...``
    separated by whitespace. The file is streamed line by line instead of being
    read into memory in one go, and blank / whitespace-only lines are skipped
    (they used to raise IndexError).

    Args:
        gloVe_file: path to the UTF-8 encoded embeddings file.

    Returns:
        dict mapping each word to a 1-D numpy float array holding its vector.
        If a word occurs more than once, the last occurrence wins.

    Raises:
        ValueError: if a vector component cannot be parsed as a float.
    """
    embedding_model = {}

    with io.open(gloVe_file, encoding = 'utf8') as f:
        for word_embedding_line in f:
            word_embedding = word_embedding_line.split()
            if not word_embedding:
                # Blank line (e.g. a trailing newline at the end of the file).
                continue
            word = word_embedding[0]
            embedding_model[word] = np.array([float(col) for col in word_embedding[1:]])

    return embedding_model

In [55]:
#get the length, in whitespace-separated tokens, of the longest sequence
def getMaxSeq(List):
    """Return the largest whitespace-token count among the strings in ``List``.

    Args:
        List: iterable of strings.

    Returns:
        Number of whitespace-separated tokens in the longest string, or 0 when
        ``List`` is empty.
    """
    token_counts = [len(text.split()) for text in List]
    return max(token_counts) if token_counts else 0

In [56]:
# Load the reports and derive their vocabularies.
findings, impression = splitReports('reports.jsonl')
common_vocab, findings_vocab, impression_vocab = getVocabularySets(findings, impression)

# Corpus the tokenizer is fitted on: all findings followed by all impressions.
common = findings + impression

# Longest findings / impression text, counted in whitespace-separated tokens.
max_findings_len = getMaxSeq(findings)
max_impression_len = getMaxSeq(impression)

# The data-driven maximum is computed first and then replaced by a fixed cap.
MAX_LEN = max(max_findings_len, max_impression_len)
MAX_LEN = 100 #99

In [57]:
# Load the pre-trained GloVe vectors from disk into a word -> vector dictionary.
embedding_model = loadGloVeModel('radglove.800M.100d.txt')

In [58]:
#embedding dimension
# Must equal the width of the vectors in embedding_model — presumably 100,
# going by the '100d' in the embeddings file name; TODO confirm.
EMBEDDING_DIM = 100
# Show the sequence length every text will be padded / truncated to.
print(MAX_LEN)

100


In [59]:
# Fit a single tokenizer on findings + impressions so both share one
# word -> index mapping.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(common)

word_to_index = tokenizer.word_index

# Convert every text into its list of integer word indices.
#indication_sequences = tokenizer.texts_to_sequences(indications)
findings_sequences = tokenizer.texts_to_sequences(findings)
impression_sequences = tokenizer.texts_to_sequences(impression)

# Pad / truncate every sequence to exactly MAX_LEN ids.
# NOTE(review): pad_sequences is called with its defaults, which per the Keras
# docs pad and truncate at the FRONT ('pre'). An impression longer than MAX_LEN
# would then lose its leading 'start' token — confirm this is intended.
#indication_data = pad_sequences(indication_sequences, maxlen = MAXLEN)
findings_data = pad_sequences(findings_sequences, maxlen = MAX_LEN)
impression_data = pad_sequences(impression_sequences, maxlen = MAX_LEN)

In [60]:
# Reverse lookup (tokenizer index -> word), used to decode id sequences back to text.
index_to_word = {idx: token for token, idx in word_to_index.items()}

In [61]:
# Row i of the matrix holds the pre-trained vector of the word whose tokenizer
# index is i. Words without a pre-trained vector keep an all-zero row.
#
# The rows are indexed with the Keras Tokenizer's indices, which run from 1 to
# len(word_to_index) (0 is never assigned to a word), so at least
# len(word_to_index) + 1 rows are required. common_vocab comes from a different
# tokenizer (nltk) and may be smaller or larger, so it alone is not a safe size:
# previously a smaller nltk vocabulary made the assignment below raise IndexError.
# Any Embedding layer initialised from this matrix must use the same row count
# as its input_dim.
num_embedding_rows = max(len(common_vocab), len(word_to_index) + 1)
embedding_matrix = np.zeros((num_embedding_rows, EMBEDDING_DIM))

for word,index in word_to_index.items():
    embedding_vector = embedding_model.get(word)

    if embedding_vector is not None :
        embedding_matrix[index] = embedding_vector

In [62]:
#decide maximum sequence length later...
MAX_SEQUENCE_LENGTH = MAX_LEN

# Embedding layer shared by the encoder and the decoder, initialised from the
# pre-trained matrix. trainable=False keeps the weights from being updated
# during training.
# input_dim is read from the matrix itself so the layer and its initializer can
# never disagree on the number of rows.
embedding_layer = Embedding(
    embedding_matrix.shape[0],
    EMBEDDING_DIM,
    embeddings_initializer=Constant(embedding_matrix),
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=False
)

In [63]:
#verify tokenization

In [64]:
# Show the first impression text (with its 'start' / 'stop' markers) so it can
# be compared with the decoded token ids printed in the next cell.
impression[0]

'start Normal chest x-XXXX. stop'

In [65]:
# Decode the first padded impression back into words; the 0 entries that
# padding added are skipped.
for token_id in impression_data[0]:
    if token_id != 0:
        print(index_to_word[token_id], end = " ")

start normal chest x xxxx stop 

In [66]:
# Teacher forcing: the decoder is fed the impression and is trained to emit the
# same sequence shifted one time-step to the left.
#
#   target        :        this  is  my  life  <eos>
#   decoder input :  <sos> this  is  my  life  <eos>
#
# The 'start' / 'stop' words play the role of <sos> / <eos>.
# (Original author's note: start -> 8, stop -> 9 — presumably their tokenizer
# indices in the original run; not verifiable from this notebook.)

m,_ = findings_data.shape

target_data = np.zeros((m, MAX_LEN), dtype = 'int32')

# target_data[r, t-1] = impression_data[r, t] for every t >= 1; the last column
# of each row stays 0.
n_impressions = len(impression_data)
target_data[:n_impressions, :MAX_LEN - 1] = impression_data[:, 1:MAX_LEN]

In [141]:
# `K` and `Multiply` are used below but were never imported, so this cell raised
# NameError on a fresh kernel.
from keras import backend as K
from keras.models import Model 
from keras.layers import Average, Input, GRU, Dense, TimeDistributed, Add, Multiply, Activation, RepeatVector, Flatten, Permute, Lambda


# Hyper-parameters.
n_features = 100
# Width of both GRUs and of the projection layers. The original code used
# MAX_LEN and the literal 100 interchangeably, which only worked because both
# happen to equal 100; the two roles are now kept apart.
HIDDEN_UNITS = 100
epochs = 10
batch_size = 100

##ENCODER
encoder_inputs = Input(shape = (MAX_LEN,), dtype = 'int32')

encoder_embeds = embedding_layer(encoder_inputs)   # (batch, MAX_LEN, EMBEDDING_DIM)

# Two GRUs read the findings, one in each direction; return_state=True makes
# each also return its final hidden state.
# NOTE(review): per the Keras docs a go_backwards=True layer returns its output
# sequence in reversed time order, so averaging x1 and x2 position-by-position
# pairs step t with step MAX_LEN-1-t — confirm this is intended.
x1, forward_h = GRU(HIDDEN_UNITS, return_sequences = True, return_state = True)(encoder_embeds)
x2, backward_h = GRU(HIDDEN_UNITS, return_sequences = True, return_state = True, go_backwards = True)(encoder_embeds)

encoder_state = Average()([forward_h, backward_h])   # (batch, HIDDEN_UNITS)
X = Average()([x1,x2])                               # (batch, MAX_LEN, HIDDEN_UNITS)

#__________#__________#__________#__________#__________#_________#__________#__________#_______________________________________________________

##DECODER
decoder_inputs = Input(shape = (MAX_LEN,), dtype = 'int32')

decoder_embeds = embedding_layer(decoder_inputs)

# The decoder starts from the averaged encoder state, so its width must match
# the encoder's.
decoder_outputs,state_s = GRU(HIDDEN_UNITS, return_sequences = True, return_state = True)(decoder_embeds, initial_state = encoder_state)

# a ----> attention of findings encoder
# One scalar score per time-step from the encoder outputs and one from the
# decoder outputs, summed and squashed with tanh.
Wh_hi = TimeDistributed(Dense(1))(X)

Ws_st = TimeDistributed(Dense(1))(decoder_outputs)

W = Add()([Wh_hi, Ws_st])

e = Dense(1, activation = 'tanh')(W)

e  = Flatten()(e)                     # (batch, MAX_LEN)

a = Activation('softmax')(e)          # attention weights over the time-steps

# Copy each weight across the feature axis so it can scale X element-wise:
# (batch, MAX_LEN) -> (batch, HIDDEN_UNITS, MAX_LEN) -> (batch, MAX_LEN, HIDDEN_UNITS)
at = RepeatVector(HIDDEN_UNITS)(a)

at_ = Permute([2, 1])(at)

sent_representation = Multiply()([X, at_])

# Weighted sum over time -> one context vector per sample, (batch, HIDDEN_UNITS).
Context = Lambda(lambda xin: K.sum(xin, axis=1))(sent_representation)

term1 = Dense(HIDDEN_UNITS)(decoder_outputs)

print(term1.shape)

# Repeat the context vector once per decoder time-step so it can be added to term1.
term2 = RepeatVector(MAX_LEN)(Context)

term2 = Dense(HIDDEN_UNITS)(term2)

print(term2.shape)

weighted_sum = Add()([term1,term2])

weighted_sum = Flatten()(weighted_sum)

tanh_weighted_sum = Activation('tanh')(weighted_sum)

# NOTE(review): the output is one softmax over MAX_LEN units per sample, while
# target_data holds MAX_LEN integer word ids per sample. categorical_crossentropy
# therefore treats the ids as a probability distribution rather than as class
# labels. Predicting words would need a per-time-step softmax over the
# vocabulary (e.g. with sparse_categorical_crossentropy) — left unchanged here
# because it alters the model's output shape.
probabilities = Dense(MAX_LEN, activation='softmax')(tanh_weighted_sum)

#_________#__________#___________#__________#__________#__________#__________#__________#______________________________________________________

model = Model(inputs = [encoder_inputs, decoder_inputs], outputs = probabilities)

model.compile(optimizer = 'adam', loss = 'categorical_crossentropy')

model.fit([findings_data, impression_data], target_data, batch_size = batch_size, epochs = epochs, validation_split = 0.2)

(?, 100, 100)
(?, 100, 100)
Train on 1808 samples, validate on 453 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fae740daa10>

In [None]:
# Sanity check: look up the embedding row of every token id in findings_data.
# Indexing the 2-D matrix with the 2-D id array yields one vector per id, i.e.
# shape (number of reports, MAX_LEN, EMBEDDING_DIM).
embedding_matrix[findings_data].shape