In [1]:
import numpy as np
import random as rand
import os
import re
import sys

from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils

Using TensorFlow backend.


In [2]:
def parse_observations(text):
    '''
    Convert text to dataset. Treat each stanza as a sequence.

    Lines are stripped; empty lines are skipped, and any line of 1-3
    characters is taken to be a sonnet number that starts a new poem.
    Within a poem a sequence is closed after every 4th line and after
    line 14. Words are split on single spaces, stripped of every non-word
    character and lowercased before being encoded.

    A partial stanza that is still open when the next sonnet number (or the
    end of the text) is reached is emitted as its own sequence, so its words
    are neither merged into the following poem nor silently dropped.

    Inputs:
    text: the whole corpus as a single string

    Returns:
    obs: list of sequences, each a list of integer word ids
    obs_map: dict mapping each distinct word to its integer id, assigned
             in order of first appearance
    '''
    line_counter = 0
    obs = []
    obs_map = {}
    obs_elem = []

    for raw_line in text.split('\n'):
        line = raw_line.strip()

        # blank lines carry no words
        if not line:
            continue

        # a very short line is a sonnet number: close whatever is still open
        # for the previous poem and restart the per-poem line count
        if len(line) <= 3:
            if obs_elem:
                obs.append(obs_elem)
                obs_elem = []
            line_counter = 0
            continue

        for token in line.split(' '):
            # make all words lowercase and remove punctuation
            # NOTE: a token made only of punctuation becomes '' and is
            # still encoded like any other word
            word = re.sub(r'[^\w]', '', token).lower()

            if word not in obs_map:
                # ids are handed out densely, in order of first appearance
                obs_map[word] = len(obs_map)

            obs_elem.append(obs_map[word])

        line_counter += 1
        if line_counter % 4 == 0 or line_counter == 14:
            # end of a quatrain, or of the closing couplet on line 14
            obs.append(obs_elem)
            obs_elem = []

    # keep a stanza that was still open when the text ended
    if obs_elem:
        obs.append(obs_elem)

    return obs, obs_map

def parse_lines(text):
    '''
    Convert text to dataset. Treat each line as a sequence.

    Every line is stripped first; lines of three characters or fewer
    (blank lines and sonnet numbers) are skipped. The remaining lines are
    split on single spaces, and each piece is stripped of non-word
    characters and lowercased before being encoded.

    Inputs:
    text: the whole corpus as a single string

    Returns:
    a list with one list of integer word ids per kept line, and the dict
    mapping each distinct word to its id (assigned in order of first
    appearance)
    '''
    vocab = {}
    encoded_lines = []

    for raw_line in text.split('\n'):
        stripped = raw_line.strip()

        # don't include sonnet numbers or blank lines
        if len(stripped) <= 3:
            continue

        # lowercase every piece and remove its punctuation
        cleaned = (
            re.sub(r'[^\w]', '', piece).lower()
            for piece in stripped.split(' ')
        )

        # setdefault hands a new word the next free id and returns the
        # existing id for a word that was seen before
        encoded_lines.append(
            [vocab.setdefault(word, len(vocab)) for word in cleaned]
        )

    return encoded_lines, vocab

def parse_poems(text):
    '''
    Convert text to dataset. Treat each poem as a sequence.

    Lines are stripped; empty lines are skipped, and any line of 1-3
    characters is taken to be a sonnet number that starts a new poem.
    A poem is closed once its 14th line has been read. Words are split on
    single spaces, stripped of every non-word character and lowercased
    before being encoded.

    A poem that is still open when the next sonnet number (or the end of
    the text) is reached is emitted as its own sequence, so a poem with a
    line count other than 14 is neither fused with the following poem nor
    dropped. The order of the words across all sequences is the order in
    which they appear in the text.

    Inputs:
    text: the whole corpus as a single string

    Returns:
    obs: list of sequences, each a list of integer word ids
    obs_map: dict mapping each distinct word to its integer id, assigned
             in order of first appearance
    '''
    line_counter = 0
    obs = []
    obs_map = {}
    obs_elem = []

    for raw_line in text.split('\n'):
        line = raw_line.strip()

        # blank lines carry no words
        if not line:
            continue

        # a very short line is a sonnet number: close the previous poem if
        # it is still open and restart the per-poem line count
        if len(line) <= 3:
            if obs_elem:
                obs.append(obs_elem)
                obs_elem = []
            line_counter = 0
            continue

        for token in line.split(' '):
            # make all words lowercase and remove punctuation
            # NOTE: a token made only of punctuation becomes '' and is
            # still encoded like any other word
            word = re.sub(r'[^\w]', '', token).lower()

            if word not in obs_map:
                # ids are handed out densely, in order of first appearance
                obs_map[word] = len(obs_map)

            obs_elem.append(obs_map[word])

        line_counter += 1
        if line_counter == 14:
            # a full sonnet has been read
            obs.append(obs_elem)
            obs_elem = []

    # keep a poem that was still open when the text ended
    if obs_elem:
        obs.append(obs_elem)

    return obs, obs_map

In [3]:
# read the whole corpus; the context manager closes the file handle as soon
# as the text has been read
with open(os.path.join(os.getcwd(), 'data/shakespeare.txt')) as corpus_file:
    text = corpus_file.read()

# encode every poem as a list of word ids and get the word -> id map
poems, word_to_int_map = parse_poems(text)

# flatten the per-poem sequences into one continuous stream of word ids
all_words = [word_id for poem in poems for word_id in poem]
words = all_words

# inverse map, used to turn predicted ids back into words
int_to_word_map = {v: k for k, v in word_to_int_map.items()}

# corpus length in words and vocabulary size
n_words = len(words)
n_vocab = len(word_to_int_map)

In [4]:
# build the integer-encoded (input window, next word) training pairs
seq_length = 7
dataX = []
dataY = []

# slide a window of seq_length words over the corpus one word at a time;
# the word immediately after each window is its prediction target
for start in range(n_words - seq_length):
    stop = start + seq_length
    dataX.append(list(words[start:stop]))
    dataY.append(words[stop])

n_sequences = len(dataX)

In [5]:
# arrange the windows as [samples, time steps, features], with a single
# feature (the word id) per time step
X = np.asarray(dataX).reshape(n_sequences, seq_length, 1)

# scale the word ids by the vocabulary size
X = X / float(n_vocab)

# one hot encode the target word of every window
y = np_utils.to_categorical(dataY)

In [6]:
# LSTM language model: one recurrent layer of 150 units, dropout, then a
# softmax layer with one output per column of the one-hot targets
model = Sequential([
    LSTM(150, input_shape=(X.shape[1], X.shape[2])),
    Dropout(0.2),
    Dense(y.shape[1], activation='softmax'),
])
model.compile(optimizer='adam', loss='categorical_crossentropy')

In [7]:
# load the network weights from best checkpoint
# NOTE(review): the checkpoint name is hard-coded and given as a relative
# path, so this cell presumably only works when the file sits in the
# working directory — confirm where the checkpoint is expected to live
filename = "weights-improvement-word-20-6.3359.hdf5"
model.load_weights(filename)
# NOTE(review): the model was already compiled with the same loss and
# optimizer when it was defined; this second compile looks redundant —
# confirm before removing
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [8]:
def sample(preds, temp=1.0):
    '''
    Helper function for sampling from softmax with different temperatures.

    The probabilities are rescaled as softmax(log(preds) / temp) and one
    index is drawn from the resulting distribution. Temperatures below 1
    sharpen the distribution, temperatures above 1 flatten it.

    Inputs:
    preds: output of softmax function
    temp: temperature to scale by (must be positive)

    Returns:
    index of the sampled class

    Raises:
    ValueError: if temp is not positive
    '''
    if temp <= 0:
        raise ValueError('temp must be positive, got %r' % (temp,))

    preds = np.asarray(preds).astype('float64')

    # log(0) = -inf is expected for classes with zero probability (they end
    # up with zero probability again), so silence that warning only
    with np.errstate(divide='ignore'):
        scaled = np.log(preds) / temp

    # subtract the maximum before exponentiating: the softmax is unchanged,
    # but the largest term becomes exp(0) = 1, so a small temperature can no
    # longer underflow every term to 0 and produce 0/0 = NaN probabilities
    scaled = scaled - np.max(scaled)
    exp_preds = np.exp(scaled)
    probs = exp_preds / np.sum(exp_preds)

    # one draw from the rescaled distribution, returned as a one-hot row
    probas = np.random.multinomial(1, probs, 1)

    return np.argmax(probas)

In [16]:
# set seed for emissions: a random context of seq_length word ids.
# The ids must be valid vocabulary indices, i.e. lie in [0, n_vocab - 1];
# randrange excludes its upper bound.
seed_to_int = [rand.randrange(n_vocab) for _ in range(seq_length)]

temps = [0.1, 0.25, 0.75, 1.5, 2, 2.5, 3, 5, 10]
lines_per_poem = 14

# generate emissions
for temp in temps:
    print('Temperature', temp)

    # start every temperature from a fresh copy of the same seed, so the
    # seed list itself is never mutated and the temperatures are comparable
    pattern = list(seed_to_int)

    # generate words, seq_length of them per printed line
    newlines = 0
    words_on_line = 0
    while newlines < lines_per_poem:
        # same preprocessing as the training inputs:
        # [samples, time steps, features], scaled by the vocabulary size
        x = np.reshape(pattern, (1, len(pattern), 1))
        x = x / float(n_vocab)
        prediction = model.predict(x, verbose=0)[0]

        # sample according to temperature
        idx = sample(prediction, temp)
        result = int_to_word_map[idx]

        # slide the context window: append the new word, drop the oldest
        pattern.append(idx)
        pattern = pattern[1:]

        # print every sampled word, so the text shown is exactly the text
        # the model was conditioned on
        sys.stdout.write(result)
        sys.stdout.write(' ')
        words_on_line += 1

        if words_on_line == seq_length:
            sys.stdout.write('\n')
            newlines += 1
            words_on_line = 0
        sys.stdout.flush()

    print()

Temperature 0.1
and the the my the the the 
thy thy and i and my my 
love and and the thy my the 
thou the i my thou thy my 
the my thou thy my thy i 
my thou i to i to thou 
thy thy thou my thou thou thy 
thou thou thy thou thy thou thou 
thy i my thy thy thy thy 
my thy thy my and thou thou 
my thy thou the thy i that 
my i thy thou the thy thy 
thy i my thy my my thy 
thy thou the thou thou thy thy 

Temperature 0.25
thy thou my my the my thee 
thou thou self thy that a my 
to i and i and my for 
the my love that the i thy 
in my to thou eyes so thy 
so thy that and i and thy 
thy the i thou that the the 
for when my thou the that thou 
in that that the my and my 
thou the i thy thou thou my 
the the that in thou so thy 
it to and to but that to 
the i of to my to to 
that thou the i my in i 

Temperature 0.75
have time i for but a as 
so that as heat so this mayst 
spring be good thoughts if thou with 
thou to times in thy with grief 
a prove self i so and golden 
live show that th