In [1]:
import nltk
import numpy as np
import re
from nltk.tokenize import TweetTokenizer
import matplotlib.pyplot as plt
from HMM import unsupervised_HMM
from HMM_helper import (
    text_to_wordcloud,
    states_to_wordclouds,
    parse_observations,
    sample_sentence,
    visualize_sparsities,
    animate_emission,
    obs_map_reverser)
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.callbacks import LambdaCallback
import random
import sys
import io

In [218]:
#Open and read the sonnet corpus; the context manager closes the file even if read() raises
with open('data/shakespeare.txt', 'r') as file:
    text = file.read()

In [231]:
#Cut the corpus at the blank-line gaps between sonnets, then drop everything up to and
#including the first newline of each chunk (the sonnet's number line)
sonnets2 = [chunk[chunk.find('\n') + 1:] for chunk in text.split('\n\n\n')]

In [232]:
#Generate 40 character sentences for training, plus y vector of next characters
sentences = []
next_chars = []

#Each window is maxlen characters long; consecutive windows start step_size characters apart
maxlen = 40
step_size = 3
for sonnet in sonnets2:
    #Stop maxlen short of the end so every window still has a following character to predict
    for i in range(0, len(sonnet) - maxlen, step_size):
        sentences.append(sonnet[i:i + maxlen])
        next_chars.append(sonnet[i + maxlen])

In [234]:
#Collect the distinct characters of the corpus (sorted for a stable ordering) and build
#both lookup directions: character -> index and index -> character
raw_text = ' '.join(sonnets2)
chars = sorted(set(raw_text))
mapping = {ch: idx for idx, ch in enumerate(chars)}
indices_char = dict(enumerate(chars))

In [236]:
#Vocabulary size = number of distinct characters that received an index
vocab_size = len(mapping.keys())
vocab_size

61

In [237]:
#Translate every training window into its list of character indices
encoded = [[mapping[ch] for ch in window] for window in sentences]

#Do the same for the target (next) character of each window
encoded_nextchar = [mapping[ch] for ch in next_chars]

In [247]:
#One-hot encode each index sequence, then stack the per-window matrices into a single array
sequences = list(map(lambda seq: to_categorical(seq, num_classes=vocab_size), encoded))
X = np.asarray(sequences)
#Targets: one one-hot row per next character
y = to_categorical(encoded_nextchar, num_classes=vocab_size)

In [248]:
#Define model: one LSTM layer over the one-hot windows, followed by a softmax over the vocabulary
model = Sequential()
#Input shape is taken from X: (X.shape[1], X.shape[2])
model.add(LSTM(150, input_shape = (X.shape[1], X.shape[2])))
model.add(Dense(vocab_size, activation = 'softmax'))
#summary() prints the table itself and returns None, so wrapping it in print() would emit a stray "None"
model.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_5 (LSTM)                (None, 150)               127200    
_________________________________________________________________
dense_5 (Dense)              (None, 61)                9211      
Total params: 136,411
Trainable params: 136,411
Non-trainable params: 0
_________________________________________________________________
None


In [263]:
#Compile with the Adam optimizer and categorical cross-entropy, then fit on the
#one-hot windows for 100 epochs (verbose=2 logs one line per epoch)
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model.fit(X, y, epochs=100, batch_size=32, verbose=2)

Epoch 1/100
 - 159s - loss: 0.8326 - accuracy: 0.7425
Epoch 2/100
 - 156s - loss: 0.7922 - accuracy: 0.7570
Epoch 3/100
 - 146s - loss: 0.7602 - accuracy: 0.7659
Epoch 4/100
 - 168s - loss: 0.7308 - accuracy: 0.7781
Epoch 5/100
 - 184s - loss: 0.7059 - accuracy: 0.7855
Epoch 6/100
 - 188s - loss: 0.6789 - accuracy: 0.7923
Epoch 7/100
 - 184s - loss: 0.6498 - accuracy: 0.8062
Epoch 8/100
 - 141s - loss: 0.6567 - accuracy: 0.8008
Epoch 9/100
 - 142s - loss: 0.5976 - accuracy: 0.8210
Epoch 10/100
 - 159s - loss: 0.5724 - accuracy: 0.8318
Epoch 11/100
 - 178s - loss: 0.5598 - accuracy: 0.8348
Epoch 12/100
 - 154s - loss: 0.5464 - accuracy: 0.8374
Epoch 13/100
 - 145s - loss: 0.5204 - accuracy: 0.8479
Epoch 14/100
 - 158s - loss: 0.5080 - accuracy: 0.8519
Epoch 15/100
 - 158s - loss: 0.4965 - accuracy: 0.8536
Epoch 16/100
 - 140s - loss: 0.4753 - accuracy: 0.8597
Epoch 17/100
 - 144s - loss: 0.4595 - accuracy: 0.8658
Epoch 18/100
 - 145s - loss: 0.4447 - accuracy: 0.8719
Epoch 19/100
 - 155

<keras.callbacks.callbacks.History at 0x252ffc95748>

In [264]:
#Save the model to 'model_linebreak.h5' (path is relative to the current working directory)
model.save('model_linebreak.h5')

In [255]:
#Adapted from https://github.com/keras-team/keras/blob/master/examples/lstm_text_generation.py
def sample(preds, temperature=1.0):
    """Sample an index from a probability array after rescaling it by a temperature.

    The log-probabilities are divided by ``temperature`` and re-normalised with a
    softmax, then one index is drawn from the resulting distribution. Temperatures
    below 1 sharpen the distribution, temperatures above 1 flatten it.

    Parameters
    ----------
    preds : array-like of float
        1-D array of probabilities (e.g. one softmax output row).
    temperature : float, default 1.0
        Rescaling factor. ``0`` is treated as the greedy limit and returns the
        index of the largest probability without sampling.

    Returns
    -------
    numpy integer
        The sampled index into ``preds``.
    """
    #float64 keeps the re-normalised probabilities within multinomial's sum-to-one tolerance
    preds = np.asarray(preds).astype('float64')

    #Greedy limit: avoids dividing by zero below
    if temperature == 0:
        return np.argmax(preds)

    #Clamp exact zeros so log() never produces -inf (and its RuntimeWarning)
    logits = np.log(np.maximum(preds, np.finfo(np.float64).tiny)) / temperature
    #Shift so the largest logit is 0: without this, exp() underflows to all zeros at
    #low temperatures and the normalisation becomes 0/0 = nan
    logits = logits - np.max(logits)
    exp_preds = np.exp(logits)
    preds = exp_preds / np.sum(exp_preds)

    #One multinomial draw yields a one-hot row; argmax recovers the drawn index
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [303]:
#Function to generate a 14 line sonnet with a particular temperature

def generate_sonnet(seed, temperature, model, mapping, vocab_size, n_lines=14, max_chars=None):
    """Generate a sonnet character by character, echoing it to stdout as it grows.

    Parameters
    ----------
    seed : str
        Starting text. Every character must be a key of ``mapping``. Its length is
        the window fed to the model, so it presumably has to match the sequence
        length the model was trained on -- verify against the training cell.
    temperature : float
        Sampling temperature forwarded to ``sample``.
    model : object
        Trained model; ``model.predict`` is called on a batch holding one one-hot window.
    mapping : dict
        Character -> index mapping. The reverse lookup is derived from it here, so
        decoding always agrees with the mapping that was passed in.
    vocab_size : int
        Number of classes used for the one-hot encoding.
    n_lines : int, default 14
        Total number of newline-terminated lines, counting those already in ``seed``.
    max_chars : int or None, default None
        Optional cap on the number of generated characters, to stop generation if
        the model never emits a newline. ``None`` means no cap.

    Returns
    -------
    str
        The seed followed by all generated characters.
    """
    print('\n')
    print('Generating sonnet with temperature:', temperature,'\n')

    #Echo the seed first so the printed sonnet is complete
    sys.stdout.write(seed)

    #Invert the mapping that was passed in instead of relying on a notebook-level global
    index_to_char = {index: char for char, index in mapping.items()}

    #One-hot encode the seed: one row per character
    encode = [mapping[char] for char in seed]
    sequence = np.array([to_categorical(x, num_classes=vocab_size) for x in encode])

    #Lines completed so far are the newlines already present in the seed
    lines = seed.count('\n')
    pieces = [seed]
    generated = 0
    while lines < n_lines and (max_chars is None or generated < max_chars):

        #Predict the next-character distribution and sample from it
        pred = model.predict(np.array([sequence]), verbose=0)
        next_enc = sample(pred[0], temperature = temperature)
        next_char = index_to_char[next_enc]
        sys.stdout.write(next_char)
        pieces.append(next_char)
        generated += 1

        #Slide the window: drop the oldest character, append the new one-hot row
        next_row = np.array([to_categorical(next_enc, num_classes = vocab_size)])
        sequence = np.append(sequence[1:, :], next_row, axis = 0)

        #A newline completes a line
        if next_char == '\n':
            lines += 1

    return ''.join(pieces)
    

In [304]:
#Sample one sonnet per temperature from the same seed line, from most to least random
seed_line = "shall i compare thee to a summer's day?\n"
for temp in (1.5, 0.75, 0.25):
    generate_sonnet(seed_line, temp, model, mapping, vocab_size)



Generating sonnet with temperature: 1.5 

shall i compare thee to a summer's day?
Why sho shange at seeelide of my argure the stare
While by bornions brest in one rust.
And thy self could in their forle-thine own,
  And unquest and admenting ad und such ridem
Pascuse on dead fal withou, 'rournisured,
Leather that outwill love's bewell of propence:
In wind'st can pussome hear the love.
  Whise id dustile ars auspapit mid your
And my noghtan diving Timp'ss eyes with dut make me mort,
  Wist to his swite, uplad thy love,
Where and you are had the word of renst heart
Which fabbtes it,at cally discrigit:
So thould the branty, when whal obeaun'se:


Generating sonnet with temperature: 0.75 

shall i compare thee to a summer's day?
Why sho love swiends the fire in one rud.
Then im the love's be things therefteer act,
Eacusted than thou his sweet fees be undee,
They hearth is receith my awastes ime.
  'stakn anveng was sprop you is my love,
As thy sond of you your sweet see those shall,
  An

In [306]:
#Second sampling run: same seed and temperatures, a fresh random draw for each sonnet
temperatures = [1.5, 0.75, 0.25]
for temp in temperatures:
    generate_sonnet(
        "shall i compare thee to a summer's day?\n",
        temp,
        model,
        mapping,
        vocab_size,
    )



Generating sonnet with temperature: 1.5 

shall i compare thee to a summer's day?
Why sho long amoun hast if tlat the dear prige:
Whther whal bety is that right of ender knows do prove.
For mare te frest her up michsed conce, now.
  Beauga rew ence, and loth with leass knows,
Toou of your for my sake ley love doopss:
But of the prassed confan books didi.
How with thy propers I desing worthe wond.
For you upraition and by the givent,
Make the cradiey in the bladues ame,
  Hasce I (oroutused than we I chen,
While it ferttily, Tul him that with up sweet
Give and therievent, nor to be befally:
  For where thy sweet fires not shad with sweet;


Generating sonnet with temperature: 0.75 

shall i compare thee to a summer's day?
Why sho sonf art awad your love the livers
  You gaid that menus of this his a trove
Hup, wil my soul by therefoued) not gookers:
Hawe and that les, with unsedier encet
Thee, marue mookerms thee with thou wist,
  And ecquercance I it math awad on the rus's,
O where a