In [2]:
from keras.callbacks import LambdaCallback, TensorBoard
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file
import numpy as np
import random, sys, io, re, string

Using TensorFlow backend.


In [3]:
shakes_lines = []
poem_starts = []
next_ln = False
min_ = 100

with open("data/shakespeare.txt") as f:
    
    # Read in all lines
    lines = f.readlines()
    for line in lines[1:]:
        
        # replace poem breaks with ~
        if re.match('\s+\d+', line):
            shakes_lines.append('~')
            next_ln = True
            continue
            
        # get rid of blank lines
        seq = line.strip()
        if len(seq) < 3:
            continue
        else:
            min_ = len(seq)
        # remove punctuation
        seq = seq.translate(str.maketrans('', '', string.punctuation))
        # make lowercase
        seq = seq.lower()
        #print(seq)
        shakes_lines.append(seq)
        
        if next_ln:
            poem_starts.append(seq)
    
processed_text = '\n'.join(shakes_lines)
# print(processed_text[:60*20])
# print(poem_starts[:5])

maxlen = max([len(ln) for ln in processed_text.split('\n')]) + 1
print('max length: ', maxlen)
print('min length: ', min_)

window_size = 40

max length:  58
min length:  46


In [4]:
chars = sorted(list(set(processed_text)))
print('total chars:', len(chars))
print(chars)
char_index = dict((c, i) for i, c in enumerate(chars))
index_char = dict((i, c) for i, c in enumerate(chars))

total chars: 29
['\n', ' ', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '~']


In [5]:
# cut the text in semi-redundant sequences of window_size characters
step = 3
sentences = []
next_chars = []
for i in range(0, len(processed_text) - window_size, step):
    sentences.append(processed_text[i: i + window_size])
    next_chars.append(processed_text[i + window_size])

X = np.zeros((len(sentences), window_size, len(chars)), dtype=np.bool)
y = np.zeros((len(sentences), len(chars)), dtype=np.bool)

print(X.shape, y.shape)

for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        try:
            X[i, t, char_index[char]] = 1
        except:
            print(i, t, char_index[char])
    y[i, char_index[next_chars[i]]] = 1

(30268, 40, 29) (30268, 29)


In [34]:
model = Sequential()
model.add(LSTM(128, input_shape=(window_size, len(chars))))
model.add(Dense(len(chars), activation='softmax'))

optimizer = RMSprop(clipnorm=1)
model.compile(optimizer, 'categorical_crossentropy', 
              metrics=['categorical_accuracy', 'categorical_crossentropy'])

In [8]:
def sample(preds, temperature=1.0):
    # helper function to sample an index from a probability array
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [18]:
def generate_poem(from_seed, temp=0.5, fast=True):
    print(f'-- Generating Poem With Temperature: {temp}')

    generated = ''
    sentence = from_seed
    generated += sentence
    
    real_start = from_seed[len(from_seed) - len(from_seed.lstrip()):]
    print(f'-- From seed: \n\"{real_start}\"\n')
    
    if not fast:
        sys.stdout.write(real_start)

    lines = 1
    if not fast:
        while lines < 14:
            x_pred = np.zeros((1, window_size, len(chars)))
            for t, char in enumerate(sentence):
                x_pred[0, t, char_index[char]] = 1.

            preds = model.predict(x_pred, verbose=0)[0]
            next_char = '~'
            if next_char == '~':
                next_index = sample(preds, temp)
                next_char = index_char[next_index]

            sentence = sentence[1:] + next_char

            if not fast:
                sys.stdout.write(next_char)
                sys.stdout.flush()

            generated += next_char
            lines = len(generated.split('\n'))
    else:
        for _ in range(300):
            x_pred = np.zeros((1, window_size, len(chars)))
            for t, char in enumerate(sentence):
                x_pred[0, t, char_index[char]] = 1.

            preds = model.predict(x_pred, verbose=0)[0]
            next_index = sample(preds, temp)
            next_char = index_char[next_index]

            sentence = sentence[1:] + next_char

            if not fast:
                sys.stdout.write(next_char)
                sys.stdout.flush()

            generated += next_char
            lines = len(generated.split('\n'))
        
    if fast:
        print(generated)
    else:
        print()


In [19]:
def on_epoch_end(epoch, _, epochs_split=5):
    if epoch % epochs_split == 0:
        
        start_index = random.randint(0, len(processed_text) - window_size - 1)
        for temp in [0.2, 0.5, 1.0]:

            sentence = processed_text[start_index: start_index + window_size]
            generate_poem(sentence, temp=temp)

In [20]:
#tensorboard = TensorBoard(log_dir='./Graph', histogram_freq=0, write_graph=True)


In [35]:
print_callback = LambdaCallback(on_epoch_end=on_epoch_end)

model.fit(X, y,
          batch_size=128,
          epochs=50,
          callbacks=[print_callback])#, tensorboard])

Epoch 1/50
-- Generating Poem With Temperature: 0.2
-- From seed: 
"say more that like of hearsay well
i wi"

 say more that like of hearsay well
i win th the the the the the the too so the the the  oo the the tho the the the to the the tho the th the tho the the the the the the thi the the the the the the the the the tha the the the the the the the tho thas the the tho the the to the tho the the thot the thin tho the the se the to the the the th
-- Generating Poem With Temperature: 0.5
-- From seed: 
"say more that like of hearsay well
i wi"

 say more that like of hearsay well
i win thin noo nme thre the so heth byt iovele thont ho mo the tho thee tong a aao mne eeme 
eo ge sen too eote the  oe hin in theutoo wo thef reet lo thtoo uoflieo hou he to ti  oo aent  ind aeit ir thattths sot tho fot loe thou he too  he set han  eon sed tor tmotho st aae ton the thor fe iy tin den w
-- Generating Poem With Temperature: 1.0
-- From seed: 
"say more that like of hearsay well
i wi"

 say more 

KeyboardInterrupt: 

In [36]:
prompt = "  shall i compare thee to a summers day\n"

for t in [0.25, 0.75, 1.5]:
    generate_poem(prompt, temp=t, fast=False)

-- Generating Poem With Temperature: 0.25
-- From seed: 
"shall i compare thee to a summers day
"

shall i compare thee to a summers day
how stan the will heavent is is bearty live
the rome the stares as and dould doth live
the watse the elest forded and thee i prease
the condur in that i come the storn
when i shall dee with the dould to gue
my seef that heart that a owe the port
of in the will it beauty of thee thee
the stor for my sweet meauty of theerse
and me it love that and there or my love
~
when i shall where of they stay so love
and theng astalls the erenth then thee

-- Generating Poem With Temperature: 0.75
-- From seed: 
"shall i compare thee to a summers day
"

shall i compare thee to a summers day
that should o thought then i come to lead
then thee soull foe to but the yen sheel spert
snet i stouths weethis all ondont redigh
sain in thaguet which that chire thy will
be thenges in the your thing ond dechiese
the ewringes of the orthing thy swant ifles
thy now full aspuren 

In [37]:
def perplexity(y_true, y_pred):
    ce = -(1.0/y_true.shape[0]) * np.sum(y_true*np.log(y_pred) + (1-y_true)*np.log(1-y_pred))
    perplexity = np.exp(ce)
    return perplexity

y_pred = model.predict(X)

print(y_pred.shape)
print(y[:1000].shape)

print(perplexity(y, y_pred))

(30268, 29)
(1000, 29)
5.781428701525854
