# Train word embeddings

In [1]:
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.utils import pad_sequences
from keras.models import Sequential
from keras.models import load_model
from keras.layers import Dense, LSTM, Bidirectional
from keras.optimizers import Adam
from keras.losses import CategoricalCrossentropy
from tensorflow.keras.losses import Reduction
from keras import Input
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

In [2]:
# Constants

# Marker tokens wrapped around every haiku and every line inside it
HAIKU_BEGIN = "<h>"
HAIKU_END = "</h>"
LINE_BEGIN = "<s>"
LINE_END = "</s>"

# A training sample spans NGRAM_SIZE tokens: NGRAM_SIZE - 1 of context plus the target word
NGRAM_SIZE = 7
EMBEDDING_SIZE = 200

BATCH_SIZE = 128

# Width of the feed-forward input: the context embeddings laid end to end
INPUT_UNITS = EMBEDDING_SIZE * (NGRAM_SIZE - 1)

EPOCHS = 3

# Line number -> syllable target used by the syllable-constrained generator
line_structure = {1: 5, 2: 7, 3: 5}

In [3]:
# Word -> syllable count lookup, read from the phonetic dictionary CSV.
word_to_syllable = {}

with open("data/phoneticDictionary.csv", 'r', encoding='utf_8') as f:
    f.readline()  # discard the first (header) row
    for line in f:  # stream the remaining rows instead of loading them all at once
        cols = line.split(',')
        # column 1 is used as the (double-quoted) word, column 3 as its integer syllable count
        word = cols[1].strip("\"")
        word_to_syllable[word] = int(cols[3])
print(len(word_to_syllable))

125927


In [4]:
# As per meeting w/ Felix: train on whole haiku so the model learns the structure.
# Result: one token list per haiku, padded with poem markers and with every
# "/"-separated line wrapped in line markers:
# [[<h> x (NGRAM_SIZE-1), <s>, line 1, </s>, <s>, line 2, </s>, <s>, line 3, </s>, </h> x (NGRAM_SIZE-1)], ...]

haiku_loc = "data/haiku_reddit.txt"
reddit_tokens = []
with open(haiku_loc, 'r', encoding='utf-8') as f:
  for line in f:
    # drop surrounding spaces and the end-of-poem "$\n" marker from each line of the poem
    stanzas = [s.strip(' $\n') for s in line.split("/")]

    tokens = [HAIKU_BEGIN] * (NGRAM_SIZE - 1)
    for stanza in stanzas:
      # whitespace split rather than NLTK tokenize because I don't know if the
      # syllable dictionary has entries for nonword NLTK tokens (eg 'll n't)
      tokens += [LINE_BEGIN, *stanza.split(), LINE_END]
    tokens += [HAIKU_END] * (NGRAM_SIZE - 1)

    reddit_tokens.append(tokens)

print(reddit_tokens[:5])

[['<h>', '<h>', '<h>', '<h>', '<h>', '<h>', '<s>', 'delicate', 'savage', '</s>', '<s>', "you'll", 'never', 'hold', 'the', 'cinder', '</s>', '<s>', 'but', 'still', 'you', 'will', 'burn', '</s>', '</h>', '</h>', '</h>', '</h>', '</h>', '</h>'], ['<h>', '<h>', '<h>', '<h>', '<h>', '<h>', '<s>', 'our', 'destination', '</s>', '<s>', 'the', 'skyline', 'of', 'this', 'city', '</s>', '<s>', 'shining', 'horizon', '</s>', '</h>', '</h>', '</h>', '</h>', '</h>', '</h>'], ['<h>', '<h>', '<h>', '<h>', '<h>', '<h>', '<s>', 'a', 'splash', 'and', 'a', 'cry', '</s>', '<s>', 'words', 'pulled', 'from', 'the', 'riverside', '</s>', '<s>', 'dried', 'in', 'the', 'hot', 'sun', '</s>', '</h>', '</h>', '</h>', '</h>', '</h>', '</h>'], ['<h>', '<h>', '<h>', '<h>', '<h>', '<h>', '<s>', 'hurt', 'but', 'poised', 'for', 'war', '</s>', '<s>', 'sturdy', 'in', 'crestfallen', 'slumps', '</s>', '<s>', 'warrior', 'spirit', '</s>', '</h>', '</h>', '</h>', '</h>', '</h>', '</h>'], ['<h>', '<h>', '<h>', '<h>', '<h>', '<h>', '

Train embeddings

In [5]:
"""
Trains a word2vec model on the given sentences. Returns the trained word embeddings as a KeyedVectors object.
Function provided from HW4 starter code.
"""
def train_model(sentences, sg=1, window_size=5, vector_size=EMBEDDING_SIZE, min_count=1):
  """
  Fit a gensim Word2Vec model on tokenized sentences and return its vectors.

  Parameters:
      sentences: list of token lists to train on
      sg: forwarded to Word2Vec as `sg` (training algorithm selector)
      window_size: forwarded to Word2Vec as `window`
      vector_size: dimensionality of each embedding (defaults to EMBEDDING_SIZE)
      min_count: forwarded to Word2Vec as `min_count`; the default of 1 is what
          the index -> embedding lookup built later in this notebook relies on
  Returns:
      the trained model's `wv` attribute (a KeyedVectors object)
  """
  word2vec = Word2Vec(
      sentences=sentences,
      vector_size=vector_size,
      window=window_size,
      min_count=min_count,
      sg=sg,
  )
  return word2vec.wv

reddit_haiku_embs = train_model(reddit_tokens)

In [6]:
# Fit a Keras tokenizer on the already-split haiku tokens and convert every
# haiku into a list of integer indices. The printed sample below shows the
# markers <h>, </h>, <s>, </s> receiving indices 1-4.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(reddit_tokens)
encoded = tokenizer.texts_to_sequences(reddit_tokens)

print(encoded[0:5])

[[1, 1, 1, 1, 1, 1, 3, 1444, 3133, 4, 3, 918, 68, 334, 5, 7333, 4, 3, 28, 62, 11, 33, 555, 4, 2, 2, 2, 2, 2, 2], [1, 1, 1, 1, 1, 1, 3, 57, 3134, 4, 3, 5, 2068, 12, 26, 451, 4, 3, 796, 615, 4, 2, 2, 2, 2, 2, 2], [1, 1, 1, 1, 1, 1, 3, 7, 2069, 13, 7, 437, 4, 3, 81, 1568, 36, 5, 7334, 4, 3, 2304, 10, 5, 274, 65, 4, 2, 2, 2, 2, 2, 2], [1, 1, 1, 1, 1, 1, 3, 518, 28, 4983, 16, 506, 4, 3, 3853, 10, 7335, 7336, 4, 3, 2305, 438, 4, 2, 2, 2, 2, 2, 2], [1, 1, 1, 1, 1, 1, 3, 3854, 1114, 538, 4, 3, 1005, 3855, 3135, 1275, 4, 3, 41, 5, 2070, 1569, 4, 2, 2, 2, 2, 2, 2]]


In [7]:
def generate_ngram_training_samples(encoded: list, ngram_size: int = None) -> tuple:
    '''
    Slides an n-gram window over every encoded sequence to build training pairs.

    Parameters:
        encoded: list of sequences, each a list of tokenizer indices
        ngram_size: total window length (context + target). Defaults to the
            notebook-level NGRAM_SIZE when omitted.
    Returns:
        tuple (training_x, training_y) where training_x[k] is a context of
        ngram_size - 1 indices and training_y[k] is the index that follows it,
        e.g. ([[1, 2], [2, 3], ...], [3, 4, ...]) for ngram_size = 3.
        Sequences shorter than ngram_size contribute no samples.
    Raises:
        ValueError: if ngram_size is smaller than 1
    '''
    if ngram_size is None:
        ngram_size = NGRAM_SIZE
    if ngram_size < 1:
        raise ValueError("ngram_size must be at least 1")

    context_size = ngram_size - 1
    training_x = []
    training_y = []

    for sentence in encoded:
        # one sample per position where a full window still fits
        for start in range(len(sentence) - ngram_size + 1):
            training_x.append(sentence[start:start + context_size])
            training_y.append(sentence[start + context_size])

    return (training_x, training_y)

In [8]:
# Build (context, next-word) training pairs from every encoded haiku and
# sanity-check the first few samples and the overall shape.
training_x, training_y = generate_ngram_training_samples(encoded)

print(training_x[0:5])
print(np.shape(training_x))
print(training_y[0:5])

[[1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 3], [1, 1, 1, 1, 3, 1444], [1, 1, 1, 3, 1444, 3133], [1, 1, 3, 1444, 3133, 4]]
(276808, 6)
[3, 1444, 3133, 4, 3]


In [9]:
def create_word_to_embedding(embs: KeyedVectors) -> dict:
    """
    Builds a plain dict from every word in the embedding vocabulary to its vector.

    Parameters:
        embs: trained embeddings, indexable by word
    Returns:
        dict of word -> embedding vector
    """
    return {word: embs[word] for word in embs.key_to_index}

def create_index_to_embedding(embs: KeyedVectors, tokenizer: Tokenizer) -> dict:
  """
  Builds a dict from each word's tokenizer index to that word's embedding.

  Parameters:
      embs: trained embeddings, indexable by word
      tokenizer: fitted tokenizer whose word_index covers every word in embs
  Returns:
      dict of tokenizer index -> embedding vector
  Raises:
      KeyError: if a word in the embedding vocabulary is missing from tokenizer.word_index
  """
  word_index = tokenizer.word_index
  return {word_index[word]: embs[word] for word in embs.key_to_index}

def get_word_to_index(word: str, tokenizer: Tokenizer):
  """
  Returns the tokenizer index of a single word.

  The word is encoded as a one-token sequence, so the first index of the first
  (and only) encoded sequence is the answer.
  """
  encoded_sequences = tokenizer.texts_to_sequences([[word]])
  first_sequence = encoded_sequences[0]
  return first_sequence[0]

In [10]:
# Materialize both lookup tables (by word and by tokenizer index) from the
# trained embeddings; the printed count is the number of indexed embeddings.
word_to_embedding = create_word_to_embedding(reddit_haiku_embs)
index_to_embedding = create_index_to_embedding(reddit_haiku_embs, tokenizer)
print(len(index_to_embedding))

14269


In [11]:
def data_generator(X: list, y: list, num_sequences_per_batch: int, i_to_emb: dict):
    '''
    Endless batch generator for the feed-forward model.

    Walks over the samples in order, wrapping back to the start when the data
    runs out, and yields one (inputs, labels) pair per full batch:
      * inputs: the context embeddings of each sample concatenated into a
        single row, reshaped to (num_sequences_per_batch, -1)
      * labels: one-hot vectors built with to_categorical, one per sample

    Parameters:
        X: list of contexts, each a list of tokenizer indices
        y: list of target tokenizer indices, parallel to X
        num_sequences_per_batch: number of samples per yielded batch
        i_to_emb: mapping from tokenizer index to embedding vector
    '''
    batch_embs = []
    batch_labels = []
    position = 0
    while True:
        position %= len(X)  # wrap around so the generator never runs dry

        # look up the embedding of every context token: shape (n-1, embedding_size)
        batch_embs.append([i_to_emb[token_index] for token_index in X[position]])

        # one extra class because tokenizer indices start at 1, leaving slot 0 unused
        one_hot = to_categorical(y[position], num_classes=len(i_to_emb)+1)
        batch_labels.append(one_hot)

        if len(batch_embs) % num_sequences_per_batch == 0:
            # flatten each sample's embeddings into one row before handing the batch over
            flat_inputs = np.reshape(batch_embs, (num_sequences_per_batch, -1))
            yield (flat_inputs, np.array(batch_labels))
            batch_embs = []
            batch_labels = []

        position += 1

        
"""
sample = next(data_generator(rnn_training_x, rnn_training_y, 2, index_to_embedding))
sample = next(data_generator(training_x, training_y, 33, index_to_embedding))
print(sample)
print(np.shape(sample[0])) # batch_size, emb_size * n-1 -- (concatenated embeddings of n-1-word sample)
print(np.shape(sample[1])) # batch_size, len(index_to_embedding) -- (a one-hot vector for each nth word result)
"""


'\nsample = next(data_generator(rnn_training_x, rnn_training_y, 2, index_to_embedding))\nsample = next(data_generator(training_x, training_y, 33, index_to_embedding))\nprint(sample)\nprint(np.shape(sample[0])) # batch_size, emb_size * n-1 -- (concatenated embeddings of n-1-word sample)\nprint(np.shape(sample[1])) # batch_size, len(index_to_embedding) -- (a one-hot vector for each nth word result)\n'

In [12]:
# Endless batch generator feeding the feed-forward model (flattened context embeddings).
train_generator = data_generator(training_x, training_y, BATCH_SIZE, index_to_embedding)

# Model 1: feedforward NN

In [13]:
def build_feed_forward_model(input_units, hidden_units, output_units, hidden_activation="relu"):
  """
  Builds and compiles a one-hidden-layer feed-forward language model.

  Parameters:
      input_units: length of each input vector (concatenated context embeddings)
      hidden_units: width of the hidden layer
      output_units: number of output classes (one per vocabulary slot)
      hidden_activation: activation of the hidden layer. Defaults to "relu";
          softmax is reserved for the output layer, because applying it to the
          hidden layer forces the hidden activations to sum to 1 and starves
          the output layer of signal.
  Returns:
      a compiled Sequential model trained with categorical cross-entropy
  """
  model = Sequential()

  model.add(Input(shape=(input_units,)))  # inputs will be vectors of this length, batch size not specified
  model.add(Dense(hidden_units, activation=hidden_activation))
  model.add(Dense(output_units, activation="softmax"))  # probability distribution over the vocabulary

  model.compile(optimizer=Adam(learning_rate=0.01), loss=CategoricalCrossentropy())
  return model

In [14]:
# One output unit per embedded word, plus one because tokenizer indices start at 1.
output_units = len(reddit_haiku_embs.key_to_index.keys()) + 1
hidden_units = 1000 #round((INPUT_UNITS + output_units) / 2)

# NOTE: hidden_units and output_units are reassigned further down for the RNN.
feed_forward_model = build_feed_forward_model(INPUT_UNITS, hidden_units, output_units)


In [15]:
#feed_forward_model.fit(x=train_generator, epochs=EPOCHS, steps_per_epoch=len(training_x) // BATCH_SIZE)

In [16]:
#feed_forward_model.save("ffnn_model_trained")

# Model 2: RNN

In [17]:
# The LSTM reads each sample as a sequence of timesteps, one per context word,
# so the number of timesteps = NGRAM_SIZE - 1 (6 with NGRAM_SIZE = 7).
# Each timestep is a single word embedding with some features:
# features = EMBEDDING_SIZE (200).
# We can then do this for a certain batch size; let's say 128 still.
# The inputs to the LSTM need to have shape (batch_size, timesteps, features),
# so for us that means shape (BATCH_SIZE, NGRAM_SIZE-1, EMBEDDING_SIZE).

# Upshot: we have to make a new data generator, since the FFNN one squished all the embeddings together.

# timestep = ngram length is supported by several example articles: 
# http://ethen8181.github.io/machine-learning/keras/rnn_language_model_basic_keras.html

In [18]:
def rnn_data_generator(X: list, y: list, batch_size: int, i_to_emb: dict):
    '''
    Endless batch generator for the RNN.

    Unlike the feed-forward generator, the per-token embeddings are kept as
    separate timesteps instead of being concatenated.

    Yields (inputs, labels) per full batch:
      * inputs: array of shape (batch_size, context length, embedding size),
        i.e. (batch_size, ngram_size - 1, embedding_size)
      * labels: one-hot vectors of shape (batch_size, len(i_to_emb) + 1)

    Parameters:
        X: list of contexts, each a list of tokenizer indices
        y: list of target tokenizer indices, parallel to X
        batch_size: number of samples per yielded batch
        i_to_emb: mapping from tokenizer index to embedding vector
    '''
    batch_embs = []
    batch_labels = []
    position = 0
    while True:
        position %= len(X)  # wrap around so the generator never runs dry

        # one embedding per context token: shape (n-1, embedding_size)
        batch_embs.append([i_to_emb[token_index] for token_index in X[position]])

        # one extra class because tokenizer indices start at 1, leaving slot 0 unused
        one_hot = to_categorical(y[position], num_classes=len(i_to_emb)+1)
        batch_labels.append(one_hot)

        if len(batch_embs) % batch_size == 0:
            yield (np.array(batch_embs), np.array(batch_labels))
            batch_embs = []
            batch_labels = []

        position += 1

# Create the RNN batch generator and pull one batch to check the shapes.
# Note that this next() call consumes the first batch, so later training on
# this same generator object starts from the second one.
rnn_training_generator = rnn_data_generator(training_x, training_y, BATCH_SIZE, index_to_embedding)
sample = next(rnn_training_generator)
print(np.shape(sample[0]))
print(np.shape(sample[1]))

(128, 6, 200)
(128, 14270)


In [19]:
def build_rnn_model(timestep_size, input_units, hidden_units, output_units):
    """
    Builds and compiles a single-layer LSTM language model.

    Parameters:
        timestep_size: number of timesteps per sample (context length)
        input_units: number of features per timestep (embedding size)
        hidden_units: number of LSTM units
        output_units: number of output classes (one per vocabulary slot)
    Returns:
        a compiled Sequential model trained with categorical cross-entropy
    """
    # input size needs to be a tuple of (timesteps, features),
    # per https://towardsdatascience.com/a-practical-guide-to-rnn-and-lstm-in-keras-980f176271bc
    layers = [
        Input(shape=(timestep_size, input_units)),  # (NGRAM_SIZE - 1, EMBEDDING_SIZE) in this notebook
        LSTM(hidden_units),
        Dense(output_units, activation="softmax"),
    ]
    model = Sequential()
    for layer in layers:
        model.add(layer)

    model.compile(optimizer=Adam(learning_rate=0.01), loss=CategoricalCrossentropy())
    return model

In [20]:
hidden_units = 128 # a lot of the examples used 128
# One output unit per embedded word, plus one because tokenizer indices start at 1.
output_units = len(reddit_haiku_embs.key_to_index.keys()) + 1

rnn_model = build_rnn_model(NGRAM_SIZE - 1, EMBEDDING_SIZE, hidden_units, output_units)
# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" to the output.
rnn_model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 128)               168448    
                                                                 
 dense_2 (Dense)             (None, 14270)             1840830   
                                                                 
Total params: 2,009,278
Trainable params: 2,009,278
Non-trainable params: 0
_________________________________________________________________
None


In [28]:
# Larger RNN variant: same architecture with 500 LSTM units.
# It trains from the same rnn_training_generator object as the 128-unit model,
# so each fit() call continues from wherever the generator was last left.
rnn_model_large = build_rnn_model(NGRAM_SIZE - 1, EMBEDDING_SIZE, 500, output_units)
rnn_model_large.fit(x=rnn_training_generator, epochs=EPOCHS, steps_per_epoch=len(training_x) // BATCH_SIZE)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x19ca1d5c9d0>

In [21]:
#y = rnn_model(next(rnn_training_generator)[0])
#print(y)  # output of model is (batch_size, vocab_size), i.e. a one-hot vector for each timestep
#print(rnn_model.summary())

In [22]:
# Train the 128-unit RNN; one epoch is one pass worth of full batches over training_x.
rnn_model.fit(x=rnn_training_generator, epochs=EPOCHS, steps_per_epoch=len(training_x) // BATCH_SIZE)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x19ccf541130>

In [None]:
#rnn_model.save("rnn_model_n_5")  # NGRAM_SIZE = 5, 128 hidden units
#rnn_model.save("rnn_model_trained")  # NGRAM_SIZE = 7, 128 hidden units
#rnn_model.save("rnn_model_large_trained") # NGRAM_SIZE = 7, 500 hidden units

# Generate Haikus

In [23]:
# load saved models
# NOTE: the cells that train and save the feed-forward model are commented out
# above, so "ffnn_model_trained" must already exist on disk from an earlier run.
feed_forward_model = load_model("ffnn_model_trained")
#rnn_model_small = load_model("rnn_model_trained")
# use the RNN trained in this session instead of a saved copy
rnn_model_small = rnn_model
#rnn_model_large = load_model("rnn_model_large_trained")

In [29]:
def predict_data_generator(X: list, num_sequences_per_batch: int, i_to_emb: dict, is_rnn=False):
    '''
    Generator of embedding batches for prediction (inputs only, no labels).

    Parameters:
        X: list of contexts, each a list of tokenizer indices
        num_sequences_per_batch: maximum number of contexts per yielded batch
        i_to_emb: mapping from tokenizer index to embedding vector
        is_rnn: when True, batches keep one embedding per timestep with shape
            (batch, context length, embedding size); otherwise each context's
            embeddings are concatenated into one row, shape (batch, -1)
    Yields:
        numpy arrays, one per batch. A final batch smaller than
        num_sequences_per_batch is yielded as well, so no input is silently
        dropped when len(X) is not a multiple of the batch size.
    Raises:
        ValueError: if num_sequences_per_batch is smaller than 1
    '''
    if num_sequences_per_batch < 1:
        raise ValueError("num_sequences_per_batch must be at least 1")

    def _as_batch(batch_embs):
        # (batch, n-1, emb_size) for the RNN, (batch, (n-1)*emb_size) otherwise
        stacked = np.array(batch_embs)
        return stacked if is_rnn else stacked.reshape(len(batch_embs), -1)

    embs = []
    for context in X:
        # list of embeddings, shape (n-1, embedding_size)
        embs.append([i_to_emb[n] for n in context])
        if len(embs) == num_sequences_per_batch:
            yield _as_batch(embs)
            embs = []

    if embs:
        # leftover contexts that did not fill a whole batch
        yield _as_batch(embs)


def generate_haiku(model: Sequential,
                   tokenizer: Tokenizer,
                   seed: list,
                   i_to_emb: dict,
                   n_words: int,
                   is_rnn=False):
    '''
    Generate a haiku from the given model by sampling one word at a time.

    Parameters:
        model: trained network mapping a context of NGRAM_SIZE - 1 embeddings
            to a probability distribution over tokenizer indices
        tokenizer: the keras preprocessing tokenizer
        seed: list of at least NGRAM_SIZE - 1 known tokens, [w1, w2, ..., w(n-1)];
            only the last NGRAM_SIZE - 1 tokens condition each prediction.
            The list is copied, never modified.
        i_to_emb: mapping from tokenizer index to embedding vector
        n_words: maximum number of words to generate after the seed
        is_rnn: True to feed (1, n-1, emb_size) inputs, False to feed the
            flattened (1, (n-1)*emb_size) form
    Returns: the seed plus generated words joined by spaces. Generation stops
        early once the haiku-end token is produced.
    Raises:
        ValueError: if the seed encodes to fewer than NGRAM_SIZE - 1 indices
        RuntimeError: if the model assigns no probability to any usable word
    '''
    context_size = NGRAM_SIZE - 1

    # copy the seed so appending generated words never alters the caller's list
    sentence = list(seed)
    sentence_indices = tokenizer.texts_to_sequences([sentence])[0]
    if len(sentence_indices) < context_size:
        raise ValueError("seed must contain at least NGRAM_SIZE - 1 tokens known to the tokenizer")

    vocab_size = len(i_to_emb) + 1
    # The output layer has a slot for every index below vocab_size, but slots
    # without an embedding (index 0) can never be fed back in as context.
    has_embedding = np.array([index in i_to_emb for index in range(vocab_size)])

    for _ in range(n_words):
        # the prediction is always conditioned on the most recent n-1 tokens
        window = [i_to_emb[index] for index in sentence_indices[-context_size:]]
        if is_rnn:
            model_input = np.reshape(window, (1, context_size, -1))
        else:
            model_input = np.reshape(window, (1, -1))

        # float64 copy: float32 outputs may not sum to 1 closely enough for np.random.choice
        probabilities = np.array(model.predict(x=model_input, verbose=0)[0], dtype=np.float64)
        probabilities = probabilities * has_embedding
        total = probabilities.sum()
        if total <= 0:
            raise RuntimeError("model assigned no probability to any word with an embedding")
        probabilities = probabilities / total

        sampled_index = int(np.random.choice(vocab_size, p=probabilities))
        new_word = tokenizer.sequences_to_texts([[sampled_index]])[0]

        sentence.append(new_word)
        sentence_indices.append(sampled_index)

        if new_word == HAIKU_END:
            break

    return " ".join(sentence)


In [25]:
def get_syllables(sentence: list, syllable_dictionary: dict):
    '''
    Sums the syllable counts of every word in the sentence.

    Returns None as soon as a word has no entry in the dictionary, so callers
    can tell "unknown" apart from a genuine count; an empty sentence yields 0.
    '''
    total = 0
    for token in sentence:
        syllables = syllable_dictionary.get(token)
        if syllables is None:
            return None
        total += syllables
    return total

def generate_haiku_greedy(model: Sequential, tokenizer: Tokenizer, i_to_emb: dict, syllable_dict: dict, seed=None, is_rnn=False):
    '''
    Generates a haiku from the model, ensuring a syllable fit using a greedy algorithm.

    Each word is drawn from the model's distribution restricted to words that
    have a known syllable count and still fit in the current line. A line ends
    as soon as its syllable target in line_structure is met exactly.

    Parameters:
        model: trained network mapping a context of NGRAM_SIZE - 1 embeddings
            to a probability distribution over tokenizer indices
        tokenizer: the keras preprocessing tokenizer
        i_to_emb: mapping from tokenizer index to embedding vector
        syllable_dict: mapping from word to syllable count
        seed: optional list of NGRAM_SIZE - 1 tokens for the model to predict
            from. If not specified, NGRAM_SIZE - 1 haiku begin tokens are used.
            The list is copied, never modified.
        is_rnn: True to feed (1, n-1, emb_size) inputs, False to feed the
            flattened (1, (n-1)*emb_size) form
    Returns: the full token sequence (markers included) joined by spaces
    Raises:
        ValueError: if the seed encodes to fewer than NGRAM_SIZE - 1 indices
        RuntimeError: if no word in the vocabulary fits the remaining syllables
    '''
    context_size = NGRAM_SIZE - 1

    # copy the seed so appending generated tokens never alters the caller's list
    haiku = [HAIKU_BEGIN] * context_size if seed is None else list(seed)
    haiku.append(LINE_BEGIN)
    haiku_indices = tokenizer.texts_to_sequences([haiku])[0]
    if len(haiku_indices) < context_size:
        raise ValueError("seed must contain at least NGRAM_SIZE - 1 tokens known to the tokenizer")

    line_begin_index = get_word_to_index(LINE_BEGIN, tokenizer)
    line_end_index = get_word_to_index(LINE_END, tokenizer)

    # Syllable count of every output slot, computed once. Slots without an
    # embedding or without a dictionary entry get infinity so they never fit.
    vocab_size = len(i_to_emb) + 1
    index_to_word = tokenizer.sequences_to_texts([[index] for index in range(vocab_size)])
    syllable_counts = []
    for index, word in enumerate(index_to_word):
        count = get_syllables([word], syllable_dict) if index in i_to_emb else None
        syllable_counts.append(np.inf if count is None else count)
    syllable_counts = np.array(syllable_counts, dtype=np.float64)

    last_line = len(line_structure)
    line_number = 1
    line_syllables = 0

    while True:
        # get the ngram window to generate the next word and convert it to embeddings
        window = [i_to_emb[index] for index in haiku_indices[-context_size:]]
        if is_rnn:
            model_input = np.reshape(window, (1, context_size, -1))
        else:
            model_input = np.reshape(window, (1, -1))

        # float64 copy: float32 outputs may not sum to 1 closely enough for np.random.choice
        probabilities = np.array(model.predict(x=model_input, verbose=0)[0], dtype=np.float64)

        # Keep only the words that still fit in this line and renormalize.
        # This draws from the same distribution as retrying until a word fits,
        # but always finishes in a single draw.
        remaining = line_structure[line_number] - line_syllables
        fits = syllable_counts <= remaining
        if not fits.any():
            raise RuntimeError("no word in the vocabulary fits the remaining syllables")
        weights = probabilities * fits
        total = weights.sum()
        if total > 0:
            weights = weights / total
        else:
            # the model gave zero mass to every fitting word: choose uniformly among them
            weights = fits / fits.sum()

        sampled_index = int(np.random.choice(vocab_size, p=weights))
        new_word = index_to_word[sampled_index]

        haiku.append(new_word)
        haiku_indices.append(sampled_index)
        line_syllables += int(syllable_counts[sampled_index])

        if line_syllables == line_structure[line_number]:
            # syllable target met: end the current line
            haiku.append(LINE_END)
            haiku_indices.append(line_end_index)

            if line_number == last_line:  # end the poem
                haiku.extend([HAIKU_END] * context_size)
                return ' '.join(haiku)

            haiku.append(LINE_BEGIN)  # start a new line
            haiku_indices.append(line_begin_index)
            line_number += 1
            line_syllables = 0
                   

In [None]:
def print_haiku(haiku: str):
    '''
    Pretty-prints a generated haiku, one poem line per output line.

    Parameters:
        haiku: space-separated tokens as returned by the generators, including
            the "<h>", "</h>", "<s>" and "</s>" markers
    The poem markers and line-begin markers are dropped, and every "</s>"
    closes the current line. Lines that contain no words are skipped, and
    words after the last "</s>" are printed as a final line.
    '''
    lines = []
    current_line = []
    for token in haiku.split():
        if token in ("<h>", "</h>", "<s>"):
            continue  # structural markers carry no text
        if token == "</s>":
            if current_line:
                lines.append(" ".join(current_line))
            current_line = []
        else:
            current_line.append(token)

    if current_line:
        # generation was cut off before the line was closed
        lines.append(" ".join(current_line))

    print("\n".join(lines))

In [32]:
# Draw 20 unconstrained samples from the feed-forward model, each seeded with
# haiku-begin tokens and capped at 30 generated words.
for _ in range(20):
    ffnn_haiku = generate_haiku(feed_forward_model, tokenizer, [HAIKU_BEGIN] * (NGRAM_SIZE - 1), index_to_embedding, 30)
    print(ffnn_haiku)

<h> <h> <h> <h> <h> <h> </s> <s> i </s> </h>
<h> <h> <h> <h> <h> <h> </s> </h>
<h> <h> <h> <h> <h> <h> longing stress cannot </s> </s> </h>
<h> <h> <h> <h> <h> <h> goodbye time their his mist cavern reason on by truth no </s> fat </s> heretodayreally so snow </s> bow be the </s> assist we </h>
<h> <h> <h> <h> <h> <h> would i black beautiful heads </s> you teach seek starts on i were drown crave </s> rime <s> </s> </h>
<h> <h> <h> <h> <h> <h> in <s> the </s> free wings mounds eat beat i've weather woman world </s> within flora </s> <s> and sometimes not can by desires <s> </s> </h>
<h> <h> <h> <h> <h> <h> strangers throttle icarus turns the </s> progress than </s> <s> animals </s> <s> </s> </s> </h>
<h> <h> <h> <h> <h> <h> at september box </s> presence pulls </s> a petrol my the will with <s> <s> </s> <s> much over sense weighs bad </s> <s> my up chilly but pain </s>
<h> <h> <h> <h> <h> <h> i back </s> <s> while wrapped </s> of </h>
<h> <h> <h> <h> <h> <h> i have </s> </h>
<h> <h> <h> 

In [26]:
# Draw 20 unconstrained samples from the 128-unit RNN, each seeded with
# haiku-begin tokens and capped at 30 generated words.
for _ in range(20):
    rnn_haiku = generate_haiku(rnn_model_small, tokenizer, [HAIKU_BEGIN] * (NGRAM_SIZE - 1), index_to_embedding, 30, is_rnn=True)
    print(rnn_haiku)

predict input:  [[1, 1, 1, 1, 1, 1]]
predict input:  [[1, 1, 1, 1, 1, 3]]
predict input:  [[1, 1, 1, 1, 3, 295]]
predict input:  [[1, 1, 1, 3, 295, 3564]]
predict input:  [[1, 1, 3, 295, 3564, 237]]
predict input:  [[1, 3, 295, 3564, 237, 4]]
predict input:  [[3, 295, 3564, 237, 4, 3]]
predict input:  [[295, 3564, 237, 4, 3, 3850]]
predict input:  [[3564, 237, 4, 3, 3850, 3101]]
predict input:  [[237, 4, 3, 3850, 3101, 3101]]
predict input:  [[4, 3, 3850, 3101, 3101, 1274]]
predict input:  [[3, 3850, 3101, 3101, 1274, 3101]]
predict input:  [[3850, 3101, 3101, 1274, 3101, 1274]]
predict input:  [[3101, 3101, 1274, 3101, 1274, 2389]]
predict input:  [[3101, 1274, 3101, 1274, 2389, 4]]
predict input:  [[1274, 3101, 1274, 2389, 4, 3]]
predict input:  [[3101, 1274, 2389, 4, 3, 13]]
predict input:  [[1274, 2389, 4, 3, 13, 723]]
predict input:  [[2389, 4, 3, 13, 723, 38]]
predict input:  [[4, 3, 13, 723, 38, 5]]
predict input:  [[3, 13, 723, 38, 5, 1163]]
predict input:  [[13, 723, 38, 5, 11

predict input:  [[1, 1, 1, 1, 3, 41]]
predict input:  [[1, 1, 1, 3, 41, 303]]
predict input:  [[1, 1, 3, 41, 303, 73]]
predict input:  [[1, 3, 41, 303, 73, 4]]
predict input:  [[3, 41, 303, 73, 4, 3]]
predict input:  [[41, 303, 73, 4, 3, 1263]]
predict input:  [[303, 73, 4, 3, 1263, 51]]
predict input:  [[73, 4, 3, 1263, 51, 1387]]
predict input:  [[4, 3, 1263, 51, 1387, 4]]
predict input:  [[3, 1263, 51, 1387, 4, 3]]
predict input:  [[1263, 51, 1387, 4, 3, 13621]]
predict input:  [[51, 1387, 4, 3, 13621, 1476]]
predict input:  [[1387, 4, 3, 13621, 1476, 4936]]
predict input:  [[4, 3, 13621, 1476, 4936, 799]]
predict input:  [[3, 13621, 1476, 4936, 799, 4]]
predict input:  [[13621, 1476, 4936, 799, 4, 3]]
predict input:  [[1476, 4936, 799, 4, 3, 186]]
predict input:  [[4936, 799, 4, 3, 186, 9]]
predict input:  [[799, 4, 3, 186, 9, 14110]]
predict input:  [[4, 3, 186, 9, 14110, 25]]
predict input:  [[3, 186, 9, 14110, 25, 4]]
predict input:  [[186, 9, 14110, 25, 4, 3]]
predict input:  [

In [30]:
# Draw 20 unconstrained samples from the 500-unit RNN, each seeded with
# haiku-begin tokens and capped at 30 generated words.
for _ in range(20):
    rnn_large_haiku = generate_haiku(rnn_model_large, tokenizer, [HAIKU_BEGIN] * (NGRAM_SIZE - 1), index_to_embedding, 30, is_rnn=True)
    print(rnn_large_haiku)

<h> <h> <h> <h> <h> <h> <s> leave for a newfound </s> </h>
<h> <h> <h> <h> <h> <h> <s> vegas k from a j </s> <s> around my eyes remain </s> <s> the verge sun pass </s> <s> gucci de from noise </s> <s> the natural to beat
<h> <h> <h> <h> <h> <h> <s> if it's haiku </s> <s> you can know you stared </s> <s> their all your gaze swims mist </s> <s> trust must a </s> </h>
<h> <h> <h> <h> <h> <h> <s> a ribbon hound </s> <s> only is eternal a black itch </s> <s> leaves souls nett </s> <s> ephemeral with shadowy </s> </h>
<h> <h> <h> <h> <h> <h> <s> a dead man bliss </s> <s> nobody special great up </s> <s> search pains true </s> <s> i see its see all </s> <s> haiku out and i simply
<h> <h> <h> <h> <h> <h> <s> a crisp eyes slowly stronger </s> </h>
<h> <h> <h> <h> <h> <h> <s> man your heart </s> <s> a good breath it is where feels </s> <s> you can cry them </s> <s> what a rotten </s> </h>
<h> <h> <h> <h> <h> <h> <s> these falls shakes of wind </s> <s> her bare lips racing </s> </h>
<h> <h> <h> <

In [None]:
# Draw 20 syllable-constrained haiku from the feed-forward model.
for _ in range(20):
    greedy_ffnn = generate_haiku_greedy(feed_forward_model, tokenizer, index_to_embedding, word_to_syllable)
    print(greedy_ffnn)

In [31]:
# Draw 20 syllable-constrained haiku from the 128-unit RNN.
for _ in range(20):
    greedy_rnn_small = generate_haiku_greedy(rnn_model_small, tokenizer, index_to_embedding, word_to_syllable, is_rnn=True)
    print(greedy_rnn_small)

<h> <h> <h> <h> <h> <h> <s> times all are told to </s> <s> the cat are wonderful on </s> <s> procrastination </s> </h> </h> </h> </h> </h> </h>
<h> <h> <h> <h> <h> <h> <s> an struggles face back </s> <s> to hold so goddess with my </s> <s> rhythms falls flying with </s> </h> </h> </h> </h> </h> </h>
<h> <h> <h> <h> <h> <h> <s> perennial air cool </s> <s> a lineage sound of the current </s> <s> came between cool through </s> </h> </h> </h> </h> </h> </h>
<h> <h> <h> <h> <h> <h> <s> off behind quiet the </s> <s> our doubt haze feathers of birds </s> <s> summer harsh upwards </s> </h> </h> </h> </h> </h> </h>
<h> <h> <h> <h> <h> <h> <s> rising stream to breathe </s> <s> dead she reflection of the </s> <s> internal has true </s> </h> </h> </h> </h> </h> </h>
<h> <h> <h> <h> <h> <h> <s> soda without catch </s> <s> superstitious and meaning </s> <s> water bombarding </s> </h> </h> </h> </h> </h> </h>
<h> <h> <h> <h> <h> <h> <s> happy ever bleed </s> <s> you knew miss us for the </s> <s> fami

In [None]:
# Draw 20 syllable-constrained haiku from the 500-unit RNN.
for _ in range(20):
    greedy_rnn_large = generate_haiku_greedy(rnn_model_large, tokenizer, index_to_embedding, word_to_syllable, is_rnn=True)
    print(greedy_rnn_large)