In [2]:
import pickle
import numpy as np
import keras
from keras.models import Model
from keras.layers import Input, LSTM, Dense,GRU

Using TensorFlow backend.


In [3]:
def limit_sentences(txt_str,num_words):
    
    return ' '.join(txt_str.lower().split(' ')[:num_words]).replace('\n',' ')

In [4]:
query_response = pickle.load(open("../data/query_response_direct_one_sentence.p","rb"))

In [5]:
query_response_limited = []
for (q,r) in query_response:
    if (len(str(q).strip().split()) >= 1) and (len(str(r).strip().split()) >= 1):
        query_response_limited.append((limit_sentences(str(q),num_words = 30),limit_sentences(str(r),num_words = 15)))

In [6]:
print(len(query_response))
print(len(query_response_limited))

3999
3925


In [7]:
lines = query_response_limited

In [8]:
%%time
batch_size = 16  # Batch size for training.
epochs = 100  # Number of epochs to train for.
latent_dim = 256  # Latent dimensionality of the encoding space.
num_samples = 300 # Number of samples to train on.


#Vectorize the data for words
input_texts = []
target_texts = []
input_words = set()
target_words = set()


# Option 2
for seq1, seq2 in lines[: min(num_samples, len(lines) - 1)]:
    input_text, target_text = seq1, seq2
    # We use "tab" as the "start sequence" character
    # for the targets, and "\n" as "end sequence" character.
    target_text = '<BOS> ' + target_text + ' <EOS>' #'<BOS>' + target_text + '<EOS>'
    input_texts.append(input_text)
    target_texts.append(target_text)
    
    # Only one input word
    input_text_list = input_text.split()
    if len(input_text_list) == 1:
        input_words.add(input_text_list[0])
    # Multiple input words
    else:
        for word in input_text_list:
            input_words.add(word)
                
    # For right now we are only looking at 1 target...the medication
    target_text_list = target_text.split()
    for word in target_text_list:
        target_words.add(word)

# Option 2
input_words = sorted(list(input_words))
target_words = sorted(list(target_words))
num_encoder_tokens = len(input_words)
num_decoder_tokens = len(target_words)
# set max input sequence (may need to pad...)
max_encoder_seq_length = max([len(txt) for txt in input_texts])
max_decoder_seq_length = max([len(txt) for txt in target_texts])

print('Number of samples:', len(input_texts))
print('Number of unique input tokens:', num_encoder_tokens)
print('Number of unique output tokens:', num_decoder_tokens)
print('Max sequence length for inputs:', max_encoder_seq_length)
print('Max sequence length for outputs:', max_decoder_seq_length)

Number of samples: 300
Number of unique input tokens: 499
Number of unique output tokens: 894
Max sequence length for inputs: 198
Max sequence length for outputs: 155
CPU times: user 3.7 ms, sys: 449 µs, total: 4.15 ms
Wall time: 4.06 ms


In [9]:
# Option 2
input_token_index = dict(
    [(char, i) for i, char in enumerate(input_words)])
target_token_index = dict(
    [(char, i) for i, char in enumerate(target_words)])

encoder_input_data = np.zeros(
    (len(input_texts), max_encoder_seq_length, num_encoder_tokens),
    dtype='float32')
decoder_input_data = np.zeros(
    (len(input_texts), max_decoder_seq_length, num_decoder_tokens),
    dtype='float32')
decoder_target_data = np.zeros(
    (len(input_texts), max_decoder_seq_length, num_decoder_tokens),
    dtype='float32')

for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):

    input_text_list = input_text.split()
    if len(input_text_list) > 1:
        for t, word in enumerate(input_text_list):
            encoder_input_data[i, t, input_token_index[word]] = 1.
    else:
        encoder_input_data[i, t, input_token_index[input_text_list[0]]] = 1.
    
    target_text_list = target_text.split()
    for t, word in enumerate(target_text_list):
        decoder_input_data[i, t, target_token_index[word]] = 1.
        if t > 0:
            # decoder_target_data will be ahead by one timestep
            # and will not include the start character.
            decoder_target_data[i, t - 1, target_token_index[word]] = 1.



## GRU

In [None]:
encoder_inputs = Input(shape=(None, num_encoder_tokens))
encoder = GRU(latent_dim, return_state=True)
encoder_outputs, state_h = encoder(encoder_inputs)

decoder_inputs = Input(shape=(None, num_decoder_tokens))
decoder_gru = GRU(latent_dim, return_sequences=True)
decoder_outputs = decoder_gru(decoder_inputs, initial_state=state_h)
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
# Run training
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')
model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
          batch_size=batch_size,
          epochs=epochs,
          validation_split=0.1)

Train on 270 samples, validate on 30 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
 32/270 [==>...........................] - ETA: 9s - loss: 0.2218 

## LSTM

In [None]:
# Define an input sequence and process it.
encoder_inputs = Input(shape=(None, num_encoder_tokens))
encoder = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)
# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(None, num_decoder_tokens))
# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the 
# return states in the training model, but we will use them in inference.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs,
                                     initial_state=encoder_states)
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
# Run training
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')
model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
          batch_size=batch_size,
          epochs=epochs,
          validation_split=0.1)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

# summarize history for loss
plt.plot(model.history.history['loss'])
plt.plot(model.history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()

In [None]:
# Option 1
for seq_index in range(100):
    # Take one sequence (part of the training set)
    # for trying out decoding.
    input_seq = encoder_input_data[seq_index: seq_index + 1]
    target_seq =decoder_input_data[seq_index: seq_index + 1]
    decoded_sentence = decode_sequence(input_seq)
    print('-')
    print('Input sentence:', input_texts[seq_index])
    print('Decoded sentence:', decoded_sentence)