In [120]:
from __future__ import print_function
import jsonlines
import re
import json
import nltk
import io
import numpy as np
from keras.layers import Embedding
from keras.initializers import Constant
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [121]:
# @ - <sos> tag
# $ - <eos> tag

#split records into list of indications, impression, findings
def splitReports(File):
    indication_list = []
    impression_list = []
    findings_list = []
    
    with open(File,'r') as file:
        for line in file:
            all = line
        
    report_list = re.findall(r'{.*?}',all)

    for report in report_list:
        obj = json.loads(report)
        indication_list.append(str(obj['Indication']))
        impression_list.append(str(obj['Impression']))
        findings_list.append(str(obj['Findings']))
        
    return (indication_list, findings_list, impression_list)    

In [122]:
#get vocab of indications, impression, findings
def getVocabularySets(indication_list, findings_list, impression_list):
    indication_vocab_set = set()
    impression_vocab_set = set()
    findings_vocab_set = set()
    common_vocab_set = set()

    for item in indication_list:
        for word in nltk.word_tokenize(item):
            indication_vocab_set.add(word.lower())
        
    for item in impression_list:
        for word in nltk.word_tokenize(item):
            impression_vocab_set.add(word.lower())
        
    for item in findings_list:
        for word in nltk.word_tokenize(item):
            findings_vocab_set.add(word.lower())
        
    common_vocab_set.update(indication_vocab_set)
    common_vocab_set.update(impression_vocab_set)
    common_vocab_set.update(findings_vocab_set)
    
    return (common_vocab_set, indication_vocab_set, findings_vocab_set, impression_vocab_set)

In [123]:
# get the embedding  matrix
def loadGloVeModel(gloVe_file):
    embedding_model = {}
    
    with io.open(gloVe_file, encoding = 'utf8') as f:
        word_embeddings = f.readlines()
        
    for word_embedding_line in word_embeddings:
        word_embedding = word_embedding_line.split()
        word = word_embedding[0]
        embedding = np.array([float(col) for col in word_embedding[1:]])
        embedding_model[word] = embedding
            
    return embedding_model

In [124]:
#get max sequences 
def getMaxSeq(List):
        
    max_len = 0
    for item in List:
        if len(item) > max_len:
            max_len = len(item)
    
    return max_len

In [126]:
indications, findings, impressions = splitReports('reports_1.jsonl')
common_vocab, indication_vocab, findings_vocab, impression_vocab = getVocabularySets(indications, impressions, findings)

common = []
common.extend(findings)
common.extend(indications)
common.extend(impressions)

max_indication_len = getMaxSeq(indications)
max_findings_len = getMaxSeq(findings)
max_impression_len = getMaxSeq(impressions)

MAXLEN = max(max_indication_len, max_findings_len, max_impression_len)

In [127]:
embedding_model = loadGloVeModel('radglove.800M.100d.txt')

In [128]:
#embedding dimension
EMBEDDING_DIM = 100

In [129]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(common)

word_to_index = tokenizer.word_index

indication_sequences = tokenizer.texts_to_sequences(indications)
findings_sequences = tokenizer.texts_to_sequences(findings)
impression_sequences = tokenizer.texts_to_sequences(impressions)

indication_data = pad_sequences(indication_sequences, maxlen = MAXLEN)
findings_data = pad_sequences(findings_sequences, maxlen = MAXLEN)
impression_data = pad_sequences(impression_sequences, maxlen = MAXLEN)

In [130]:
index_to_word = {}
for word,index in word_to_index.items():
    index_to_word[index] = word

In [131]:
embedding_matrix = np.zeros((len(common_vocab),EMBEDDING_DIM))

for word,index in word_to_index.items():
    embedding_vector = embedding_model.get(word)
    
    if embedding_vector is not None :
        embedding_matrix[index] = embedding_vector

In [132]:
#decide maximum sequence length later...
MAX_SEQUENCE_LENGTH = MAXLEN

# check doc.s once
# trainable=False to prevent the weights from being updated during training
embedding_layer = Embedding(len(common_vocab), EMBEDDING_DIM, embeddings_initializer=Constant(embedding_matrix), input_length=MAX_SEQUENCE_LENGTH, trainable=False)

In [148]:
#verify tokenization

In [149]:
findings[0]

'The cardiac silhouette and mediastinum size are within normal limits. There is no pulmonary edema. There is no focal consolidation. There are no XXXX of a pleural effusion. There is no evidence of pneumothorax.'

In [150]:
for i in findings_data[0]:
    if i == 0:
        continue
    print(index_to_word[i], end = " ")

the cardiac silhouette and mediastinum size are within normal limits there is no pulmonary edema there is no focal consolidation there are no xxxx of a pleural effusion there is no evidence of pneumothorax 

In [33]:
#model

# a ----> attention of findings encoder
# a' ---> attention of background encoder

encoder1_inputs = Input( input_shape, dtype = 'int32')

x = embedding_layer(encoder1_inputs)

x, state_h, state_c = Bidirectional(LSTM(100, return_state = True))(x) #return state is by default True

encoder_state = state_h

#------------------------------------------------------------------------------------------------------

encoder2_inputs = Input( input_shape, dtype = 'int32')

x = embedding_layer(encoder2_inputs)

x, state_h, state_c = Bidirectional(LSTM(100, return_state = True))()

decoder_inputs = Input( input_shape, dtype = 'int32')

_ = embedding_layer(decoder_inputs)

_ = LSTM(200, return_sequences = True)(_ ,initial_state = encoder_states)

decoder_outputs = Dense()_

model = Model([encoder1_inputs, encoder2_inputs, decoder_inputs], decoder_outputs)

model.compile(optimizer = 'adam', loss = 'categorical_crossentropy')

model.fit([indication_data ,findings_data, impression_data], impression_data, batch_size = batch_size, epochs = epochs, validation_split = 0.2)

KeyError: 8