In [None]:
from __future__ import print_function
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import LSTM, GRU, Input, Flatten, Masking, merge, Reshape, Lambda, TimeDistributed, Dropout
from keras.layers.merge import Concatenate, Add
from keras import backend as K
from keras.optimizers import RMSprop, SGD, Adam
from keras.utils.data_utils import get_file
from keras.models import Model
import numpy as np
import random
import sys
import re

# Model / training hyper-parameters.
DROPOUT = 0.1                # dropout rate handed to get_encoder / get_autoencoder
SENTENCE_BATCH_SIZE = 128    # number of sentences per generated training batch
LSTM_WIDTH = 512             # hidden units per recurrent layer (the layers are GRUs)
SENTENCE_START = '#'         # marker prepended to every sentence
SENTENCE_END = '_'           # marker inserted after sentence-final punctuation

# Regex fragments used by split_into_sentences to tell abbreviation periods
# apart from sentence-ending periods.
caps = "([A-Z])"
prefixes = "(Mr|St|Mrs|Ms|Dr)[.]"
suffixes = "(Inc|Ltd|Jr|Sr|Co)"
# Raw string: "\s" inside a normal string literal is an invalid escape sequence
# (it warns on recent Python versions); the pattern text itself is unchanged.
starters = r"(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
acronyms = "([A-Z][.][A-Z][.](?:[A-Z][.])?)"
websites = "[.](com|net|org|io|gov)"

def split_into_sentences(text):
    """Split raw text into sentences wrapped in start/end markers.

    Periods that belong to abbreviations, initials, acronyms and web domains
    are temporarily replaced by the placeholder ``<prd>`` so that only genuine
    sentence terminators (". ", "? ", "! ") are turned into ``<stop>`` split
    points.  ``SENTENCE_END`` is inserted right after each terminator and
    ``SENTENCE_START`` is prepended to every kept sentence.

    Args:
        text: the text to split (str).

    Returns:
        A list of strings.  Only sentences whose stripped length (including
        the end marker, excluding the start marker) is strictly between 30 and
        500 characters are kept.  Whatever follows the last split point is
        discarded.
    """
    text = " " + text + "  "
    text = text.replace("\n"," ")
    text = re.sub(prefixes,"\\1<prd>",text)
    text = re.sub(websites,"<prd>\\1",text)
    text = text.replace("Ph.D.","Ph<prd>D<prd>")
    # Raw string: "\s" in a normal literal is an invalid escape sequence.
    text = re.sub(r"\s" + caps + "[.] "," \\1<prd> ",text)
    text = re.sub(acronyms+" "+starters,"\\1<stop> \\2",text)
    text = re.sub(caps + "[.]" + caps + "[.]" + caps + "[.]","\\1<prd>\\2<prd>\\3<prd>",text)
    text = re.sub(caps + "[.]" + caps + "[.]","\\1<prd>\\2<prd>",text)
    text = re.sub(" "+suffixes+"[.] "+starters," \\1<stop> \\2",text)
    text = re.sub(" "+suffixes+"[.]"," \\1<prd>",text)
    text = re.sub(" " + caps + "[.]"," \\1<prd>",text)
    # Move terminators outside closing quotes so the ". " rules below see them.
    text = text.replace(".”","”.")
    text = text.replace(".\"","\".")
    text = text.replace("!\"","\"!")
    text = text.replace("?\"","\"?")
    text = text.replace(". ","."+SENTENCE_END+"<stop> ")
    text = text.replace("? ","?"+SENTENCE_END+"<stop> ")
    text = text.replace("! ","!"+SENTENCE_END+"<stop> ")
    # Restore EVERY protected period.  Replacing only "<prd> " (with a trailing
    # space) left literal "<prd>" tokens wherever the placeholder was followed
    # by another character (e.g. "norvig<prd>com") and swallowed the space
    # after abbreviations ("Mr.Smith").
    text = text.replace("<prd>",".")
    # The piece after the final <stop> is trailing text, not a finished sentence.
    pieces = [piece.strip() for piece in text.split("<stop>")[:-1]]
    return [SENTENCE_START + piece for piece in pieces if 30 < len(piece) < 500]


# Download (or reuse the cached copy of) the Shakespeare corpus.
path_shakespeare = get_file('shakespeare.txt', origin='http://norvig.com/ngrams/shakespeare.txt')
# Read through a context manager so the file handle is closed deterministically.
with open(path_shakespeare) as corpus_file:
    text_shakespeare = corpus_file.read()
# NOTE(review): r"\\'" is the three-character sequence backslash, backslash,
# apostrophe -- confirm that this (rather than a single escaped apostrophe) is
# what should be blanked out.
text_shakespeare = text_shakespeare.lower().replace('\n', ' ').replace('=', ' ').replace(r"\\'", " ")
print('corpus length:', len(text_shakespeare))

# Sort sentences by length so that each consecutive batch slice contains
# sentences of similar length.
sentences_shakespeare = np.array(split_into_sentences(text_shakespeare))
sentences_shakespeare = sorted(sentences_shakespeare, key=len)

# Vocabulary: every distinct character that occurs in the kept sentences.
chars = sorted(list(set("".join(sentences_shakespeare))))

print('total chars:', len(chars))
# Lookup tables between characters and their one-hot positions.
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

def text_generator(sentences):
    """Endlessly yield one-hot training batches for the autoencoder.

    The sentences are walked in consecutive slices of ``SENTENCE_BATCH_SIZE``;
    once the end is reached the walk restarts from the beginning.

    Args:
        sentences: sequence of strings whose characters are all keys of
            ``char_indices``.

    Yields:
        ``([X, X], y, w)`` where
          * ``X`` is an int32 array of shape
            (SENTENCE_BATCH_SIZE, longest sentence in the slice, len(chars))
            holding the one-hot encoded sentences (used for both model inputs),
          * ``y`` has the same shape and holds, at step ``t``, the one-hot
            encoding of the character at ``t + 1`` (next-character target),
          * ``w`` is an int32 array of shape (SENTENCE_BATCH_SIZE, longest
            sentence) that is 1 wherever ``y`` carries a target and 0 elsewhere.
        A final slice shorter than SENTENCE_BATCH_SIZE is padded with all-zero
        rows (weight 0) so every batch has the same first dimension.

    Raises:
        ValueError: if ``sentences`` is empty (raised on the first ``next()``);
            without this check the loop would spin forever without yielding.
    """
    if len(sentences) == 0:
        raise ValueError("text_generator needs at least one sentence")

    cum_count = 0  # number of passes started over `sentences`
    while True:
        count = 0  # batch index within the current pass
        cum_count += 1
        for start in range(0, len(sentences), SENTENCE_BATCH_SIZE):
            print('batch number: ', count, ', cumulative batch number: ', cum_count)
            count += 1

            sentence_batch = sentences[start:start + SENTENCE_BATCH_SIZE]
            maxlen_batch = len(max(sentence_batch, key=len))

            X = np.zeros((SENTENCE_BATCH_SIZE, maxlen_batch, len(chars)), dtype=np.int32)
            y = np.zeros((SENTENCE_BATCH_SIZE, maxlen_batch, len(chars)), dtype=np.int32)
            w = np.zeros((SENTENCE_BATCH_SIZE, maxlen_batch), dtype=np.int32)

            # `row` is kept distinct from the batch offset `start`; the two
            # used to share one name.
            for row, sentence in enumerate(sentence_batch):

                for t, char in enumerate(sentence):
                    X[row, t, char_indices[char]] = 1

                # The last character has no successor, hence len(sentence) - 1.
                for t in range(len(sentence) - 1):
                    target_pos = t + 1
                    y[row, t, char_indices[sentence[target_pos]]] = 1
                    w[row, t] = 1

            yield ([X, X], y, w)



# Announce the start of model construction in the cell output.
print('Build model...')


def concat_context(inputs):
    """Append a fixed context vector to every time step of a sequence.

    Intended as the function of a Keras ``Lambda`` layer.

    Args:
        inputs: a two-element list ``[seq, c]``; ``seq`` is indexed as
            (batch, time, features) and ``c`` as (batch, width).

    Returns:
        ``seq`` concatenated along the feature axis with ``c`` repeated for
        each time step.  Time steps whose ``seq`` features are all zero are
        zeroed in the result as well, so padded steps stay all-zero.
    """
    seq = inputs[0]
    c = inputs[1]
    # Take the context width from the tensor itself instead of hard-coding 512,
    # so the layer keeps working when the recurrent width is changed.
    context_width = K.int_shape(c)[-1]
    if context_width is None:
        # Static width unknown: fall back to the dynamic shape.
        context_width = K.shape(c)[-1]
    c_tiled = K.tile(K.reshape(c, [-1, 1, context_width]), (1, K.shape(seq)[1], 1))
    out = K.concatenate([seq, c_tiled], axis=2)

    # True for time steps that contain at least one non-zero feature in `seq`.
    boolean_mask = K.any(K.not_equal(seq, 0), axis=-1, keepdims=True)

    return out * K.cast(boolean_mask, K.floatx())


def get_encoder(lstm_width, dropout):
    """Build the sentence encoder model.

    The one-hot input of shape (time, len(chars)) is masked on all-zero steps
    and passed through three stacked GRUs; the first reads the sequence
    backwards and only the last collapses the time axis.

    Args:
        lstm_width: number of units in each GRU.
        dropout: dropout rate given to each GRU.

    Returns:
        A Keras ``Model`` mapping the one-hot sequence to the output of the
        last GRU.
    """
    encoder_in = Input(shape=(None, len(chars)))
    hidden = Masking(mask_value=0)(encoder_in)

    # (go_backwards, return_sequences) for each GRU, bottom to top.
    gru_stack = (
        (True, True),
        (False, True),
        (False, False),
    )
    for backwards, keep_sequence in gru_stack:
        hidden = GRU(lstm_width,
                     return_sequences=keep_sequence,
                     go_backwards=backwards,
                     dropout=dropout)(hidden)

    return Model(inputs=[encoder_in], outputs=[hidden])
    
    
def get_autoencoder(encoder, lstm_width, dropout):
    """Build the teacher-forced autoencoder around an existing encoder.

    The encoder output is concatenated (via ``concat_context``) to every step
    of the masked teacher input, run through three sequence-returning GRUs,
    concatenated with the encoder output once more and mapped per time step to
    a softmax over the vocabulary.

    Args:
        encoder: model producing one context vector per input sentence.
        lstm_width: units in each decoder GRU and in the per-step ReLU layer.
        dropout: dropout rate given to each decoder GRU.

    Returns:
        A Keras ``Model`` with inputs ``[context_input, teacher_input]`` and a
        single output of per-step character probabilities.
    """
    vocab_size = len(chars)

    context_input = Input(shape=(None, vocab_size))
    sentence_code = encoder(context_input)

    teacher_input = Input(shape=(None, vocab_size))
    hidden = Masking(mask_value=0)(teacher_input)

    # First injection of the sentence code: alongside every input character.
    hidden = Lambda(concat_context)([hidden, sentence_code])

    for _ in range(3):
        hidden = GRU(lstm_width, return_sequences=True, dropout=dropout)(hidden)

    # Second injection: alongside the top GRU's output at every step.
    hidden = Lambda(concat_context)([hidden, sentence_code])

    hidden = TimeDistributed(Dense(lstm_width, activation='relu'))(hidden)
    hidden = TimeDistributed(Dropout(0.5))(hidden)
    char_probs = TimeDistributed(Dense(vocab_size, activation='softmax'))(hidden)

    return Model(inputs=[context_input, teacher_input], outputs=[char_probs])

    
# Instantiate the encoder and wrap it in the autoencoder.
encoder = get_encoder(LSTM_WIDTH, DROPOUT)
shakespeare_autoencoder = get_autoencoder(encoder, LSTM_WIDTH, DROPOUT)

# Adam with gradient-norm clipping; "temporal" sample weights let the data
# generator weight every time step individually.
optimizer = Adam(clipnorm=1.0)
shakespeare_autoencoder.compile(
    optimizer=optimizer,
    loss='categorical_crossentropy',
    sample_weight_mode="temporal",
)


def sample(preds, temperature=1.0):
    """Sample an index from a probability array after temperature scaling.

    Args:
        preds: 1-D array-like of non-negative probabilities.
        temperature: positive scaling factor; values below 1 sharpen the
            distribution, values above 1 flatten it.

    Returns:
        The sampled index (a NumPy integer).

    Raises:
        ValueError: if ``temperature`` is not strictly positive.
    """
    if temperature <= 0:
        raise ValueError("temperature must be strictly positive")

    preds = np.asarray(preds).astype('float64')
    # log(0) = -inf is intended (zero probability stays zero); silence the warning.
    with np.errstate(divide='ignore'):
        logits = np.log(preds) / temperature
    # Shift by the maximum before exponentiating: at low temperature every
    # exp() would otherwise underflow to 0 and the normalisation become 0/0.
    logits = logits - np.max(logits)
    exp_preds = np.exp(logits)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)


shakespeare_gen = text_generator(sentences_shakespeare)

# Integer number of generator batches per fit call (true division would hand
# Keras a float).  NOTE(review): the "- 10" is carried over unchanged and makes
# each fit call consume fewer batches than one full pass -- confirm the intent.
steps_per_epoch = max(1, len(sentences_shakespeare) // SENTENCE_BATCH_SIZE - 10)

for iteration in range(1, 100):
    print()
    print('-' * 50)
    print('Iteration', iteration)

    shakespeare_history = shakespeare_autoencoder.fit_generator(shakespeare_gen,
                            steps_per_epoch=steps_per_epoch,
                            epochs=1, verbose=1, workers=1)
    # Checkpoint AFTER training so the file always reflects the latest weights,
    # including those of the final iteration.
    shakespeare_autoencoder.save('model_shakespeare.hd5')



corpus length: 4538523
total chars: 45
Build model...

--------------------------------------------------
Iteration 1
batch number: Epoch 1/1
 0 , cumulative batch number:  1
batch number:  1 , cumulative batch number:  1
batch number:  2 , cumulative batch number:  1
batch number:  3 , cumulative batch number:  1
batch number:  4 , cumulative batch number:  1
batch number:  5 , cumulative batch number:  1
batch number:  6 , cumulative batch number:  1
batch number:  7 , cumulative batch number:  1
batch number:  8 , cumulative batch number:  1
batch number:  9 , cumulative batch number:  1
batch number:  10 , cumulative batch number:  1
  1/288 [..............................] - ETA: 694s - loss: 3.8095batch number:  11 , cumulative batch number:  1
  2/288 [..............................] - ETA: 420s - loss: 3.7869batch number:  12 , cumulative batch number:  1
  3/288 [..............................] - ETA: 336s - loss: 3.7140batch number:  13 , cumulative batch number:  1
  4/288 [

In [16]:
# Some simple test of model prediction performance
def _decode_one_hot(rows):
    """Return the string formed by the argmax character of each row of `rows`."""
    return ''.join(indices_char[idx] for idx in np.argmax(rows, axis=1))


# Draw the next batch from the training generator and predict on its inputs.
test_set = next(shakespeare_gen)
prediction = shakespeare_autoencoder.predict_on_batch(test_set[0])
for i in range(len(test_set[1])):
    sentence_decode = _decode_one_hot(test_set[0][0][i])   # encoder input
    frag_decode = _decode_one_hot(test_set[0][1][i])       # teacher input
    predicted_sec = _decode_one_hot(prediction[i])         # model output
    true_sec = _decode_one_hot(test_set[1][i, :, :])       # next-character targets
    # All-zero (padding) rows have argmax 0 and therefore decode to
    # indices_char[0].
    print('Sentence: ', sentence_decode)
    print('Predicted: \"%s\"' % (predicted_sec))


batch number:  2 , cumulative batch number:  100
Sentence:  #ay , greek , that is my name ._
Predicted: "ay , greek , that is my name ._ "
Sentence:  #thou mayst tell that knowest ._
Predicted: "thou mayst tell that knowest ._ "
Sentence:  #what is he more than another ?_
Predicted: "what is he more than another ?_ "
Sentence:  #you depend upon him , i mean ?_
Predicted: "you depend upon him , i mean ?_ "
Sentence:  #nell , he is full of harmony ._
Predicted: "nell , he is full of harmony ._ "
Sentence:  #but , marry , thus , my lord ._
Predicted: "but , marry , thur , my lord ._ "
Sentence:  #come , your disposer is sick ._
Predicted: "come , your disposer is sick a_ "
Sentence:  #come , give me an instrument ._
Predicted: "come , give me an instrument ._ "
Sentence:  #hot thoughts , and hot deeds ?_
Predicted: "hot thoughts , and hot deeds ?_ "
Sentence:  #shall ajax fight with hector ?_
Predicted: "shall ajax fight with hector ?_ "
Sentence:  #on , lord ; we'll follow you ._
Predict

In [14]:
def sentence_encode(sentence):
    """One-hot encode a sentence as a batch containing a single item.

    Args:
        sentence: string whose characters are all keys of ``char_indices``.

    Returns:
        int32 array of shape (1, len(sentence), len(chars)) with a single 1
        per time step, at the column given by ``char_indices``.
    """
    one_hot = np.zeros((1, len(sentence), len(chars)), dtype=np.int32)
    for position in range(len(sentence)):
        column = char_indices[sentence[position]]
        one_hot[0, position, column] = 1
    return one_hot
        
# Probe sentence fed to the autoencoder.  Alternatives tried earlier:
#   "#fight valiantly to-day : and yet i do thee wrong to mind thee of it , for thou art fram'd of the firm truth of valour ._"
#   "#have i forgotten myself so far that i have not even told you his name ?_"
#   "#fight valiantly to-day : and yet i do thee wrong to mind thee of it ._"
test_sentence = "#i predict that robots will be useful ._"

# The same encoding serves as encoder input and as teacher input.
test_sentence_encoded = sentence_encode(test_sentence)
test_shot = [test_sentence_encoded] * 2

# Batch of one: keep the (time, len(chars)) prediction of the last (only) item.
prediction_example = shakespeare_autoencoder.predict_on_batch(test_shot)[-1]

# Greedy decoding: most probable character at every time step.
predicted_symbols = prediction_example.argmax(axis=1)
predicted_sec = ''.join(indices_char[symbol] for symbol in predicted_symbols)

print('Predicted sec:')
predicted_sec


Predicted sec:


'i predent that robbti will belunedul t_i'

In [11]:
# Display the metrics recorded by the most recent fit_generator call
# (shakespeare_history is reassigned on every training iteration).
shakespeare_history.history

{'loss': [0.17918064532911077]}

In [26]:
import utils

def beam_search(model, sentence_enc, start_frag_enc, beam_width, num_candidates,
                max_len=500, verbose=True):
    """Decode sentences from the autoencoder with beam search.

    Starting from ``start_frag_enc``, each step asks the model for the
    next-character distribution of every live candidate and keeps the
    ``beam_width`` highest-scoring one-character extensions.  A candidate is
    finished once it is extended with ``SENTENCE_END`` or reaches ``max_len``
    time steps; the search stops when ``num_candidates`` finished sentences
    exist or no live candidate remains.

    Args:
        model: object whose ``predict([sentence_enc, candidate])`` returns an
            array indexed as [batch, time, character].
        sentence_enc: one-hot encoding of the sentence to reconstruct; its
            last dimension is the vocabulary size.
        start_frag_enc: one-hot encoding, indexed (1, time, vocabulary), of
            the fragment the decoder starts from.
        beam_width: maximum number of live candidates kept per step.
        num_candidates: number of finished sentences to collect.
        max_len: hard cap on candidate length; guarantees termination.
        verbose: print the live candidates after every step.

    Returns:
        A list of at most ``num_candidates`` tuples
        ``(utils.sentence_decode(candidate, chars), score)`` ordered from best
        to worst score, where the score is the accumulated log-probability.
    """
    num_chars = sentence_enc.shape[-1]
    EPS = 1e-10  # keeps log() finite for zero probabilities
    # One-hot position of the end marker; None if it is not in the vocabulary.
    end_symbol_idx = char_indices.get(SENTENCE_END)

    candidates = {0: start_frag_enc}
    candidate_scores = {0: 0.0}
    complete_sentences = []
    complete_scores = []

    while candidates and len(complete_sentences) < num_candidates:

        # score_matrix[s, i]: score of extending live candidate i with symbol s.
        # Columns without a live candidate stay at -inf.
        score_matrix = -np.inf * np.ones((num_chars, beam_width))
        for i, candidate in candidates.items():
            p = model.predict([sentence_enc, candidate])[0, -1, :]
            score_matrix[:, i] = np.log(p + EPS) + candidate_scores[i]
        best_flat = (-score_matrix).argsort(axis=None)[:beam_width]
        symbol_idxs, parent_idxs = np.unravel_index(best_flat, score_matrix.shape)

        new_candidates = {}
        new_candidate_scores = {}
        for symbol_idx, parent_idx in zip(symbol_idxs, parent_idxs):
            score = score_matrix[symbol_idx, parent_idx]
            if not np.isfinite(score):
                # Slot belongs to a column with no live candidate.
                continue
            new_symbol_enc = np.zeros((1, 1, num_chars))
            new_symbol_enc[0, 0, symbol_idx] = 1
            extended = np.concatenate([candidates[parent_idx], new_symbol_enc], axis=1)

            if symbol_idx == end_symbol_idx or extended.shape[1] >= max_len:
                # Finished (or length-capped) hypotheses leave the beam.
                complete_sentences.append(extended)
                complete_scores.append(score)
            else:
                slot = len(new_candidates)
                new_candidates[slot] = extended
                new_candidate_scores[slot] = score

        candidates = new_candidates
        candidate_scores = new_candidate_scores

        if verbose:
            for i, candidate in candidates.items():
                sentence = utils.sentence_decode(candidate, chars)
                print("Candidate %s: " %(i), sentence, 'Score: ', candidate_scores[i])
            print()

    # Best-first ordering; the list index breaks ties so arrays are never compared.
    ranked = sorted(zip(complete_scores, range(len(complete_sentences))), reverse=True)
    return [(utils.sentence_decode(complete_sentences[k], chars), score)
            for score, k in ranked[:num_candidates]]



# Sentence to reconstruct.  The first assignment is immediately overwritten by
# the second, so only the second sentence is actually used.
# NOTE(review): the second sentence does not begin with the '#' start marker
# that split_into_sentences prepends to every training sentence -- confirm
# whether that is intentional.
test_sentence = '#have i forgotten myself so far that i have not even told you his name ?_'
test_sentence = 'the first dozen runners in the queue for their official numbers were chosen to meet the prince and they posed behind him as he cut the ribbon ._'

# Encode the full sentence and the initial decoder fragment (just '#') with the
# helpers from utils.py, passing the notebook's vocabulary.
sentence_enc = utils.sentence_encode(test_sentence, chars)
fragment_enc = utils.sentence_encode('#', chars)

# Run beam search over the trained autoencoder.
sentence_predicted = beam_search(shakespeare_autoencoder, sentence_enc, fragment_enc, beam_width=20, num_candidates=5)

Candidate 0:  #t Score:  -5.3526404372e-05
Candidate 1:  #s Score:  -10.6628465652
Candidate 2:  #b Score:  -11.1336717606
Candidate 3:  #w Score:  -11.405749321
Candidate 4:  #i Score:  -13.3258228302
Candidate 5:  #h Score:  -13.7132930756
Candidate 6:  #f Score:  -14.2752799988
Candidate 7:  #n Score:  -14.3762111664
Candidate 8:  #u Score:  -15.8991384506
Candidate 9:  #' Score:  -16.2054824829
Candidate 10:  #r Score:  -16.814207077
Candidate 11:  #  Score:  -17.3790950775
Candidate 12:  #y Score:  -17.3945960999
Candidate 13:  #, Score:  -17.9094238281
Candidate 14:  #a Score:  -18.0691261292
Candidate 15:  #e Score:  -18.1471004486
Candidate 16:  #m Score:  -18.2638721466
Candidate 17:  #p Score:  -18.2846908569
Candidate 18:  #o Score:  -18.6953792572
Candidate 19:  #l Score:  -19.9841079712

Candidate 0:  #th Score:  -5.49569158466e-05
Candidate 1:  #sh Score:  -10.6647815704
Candidate 2:  #be Score:  -11.1533727646
Candidate 3:  #wh Score:  -11.4071617126
Candidate 4:  #he Sc

KeyboardInterrupt: 

{'loss': [0.15348826182497002]}

In [24]:
# List the entries of the current working directory.
import os 
os.listdir('.')

['.ipynb_checkpoints',
 'lstm_encoder_decoder.ipynb',
 'utils.py',
 'MNIST_Static_HyperNetwork_Example.ipynb',
 'model_shakespeare.hd5']

In [21]:
# Show the kernel's current working directory.
pwd

'/sandbox'