# Train word embeddings

In [42]:
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.utils import pad_sequences
from keras.models import Sequential
from keras.models import load_model
from keras.layers import Dense, LSTM, Bidirectional
from keras.optimizers import Adam
from keras.losses import CategoricalCrossentropy
from tensorflow.keras.losses import Reduction
from keras import Input
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

In [None]:
# Constants

HAIKU_BEGIN = "<h>"
HAIKU_END = "</h>"
LINE_BEGIN = "<s>"
LINE_END = "</s>"

NGRAM_SIZE = 7
EMBEDDING_SIZE = 200

BATCH_SIZE = 128

INPUT_UNITS = (NGRAM_SIZE - 1) * EMBEDDING_SIZE

EPOCHS = 3

line_structure = {1 : 5,
                 2 : 7,
                 3 : 5}

In [41]:
word_to_syllable = {}

with open("data/phoneticDictionary.csv", 'r', encoding='utf_8') as f:
    f.readline()
    for line in f.readlines():
        cols = line.split(',')
        word_to_syllable[cols[1].strip("\"")] = int(cols[3])
print(len(word_to_syllable))

125927


In [3]:
# as per meeting w/ felix, training on whole haiku so it learns the structure
# results = lists of tokenized haiku, with poem and line separator tokens:
# [[<H>,<S>,stanza 1,</S>,<S>,stanza 2,</S>,<S>,stanza 3,</S>,</H>],...]

haiku_loc = "data/haiku_reddit.txt"
reddit_tokens = []
with open(haiku_loc, 'r', encoding='utf-8') as f:
  for line in f:
    tokens = []
    # remove trailing spaces and end-of-poem $/n marker
    stanzas = [s.strip(' $\n') for s in line.split("/")]
    tokens += [HAIKU_BEGIN] * (NGRAM_SIZE - 1)
    
    for stanza in stanzas:
      tokens.append(LINE_BEGIN)
      # whitespace split rather than NLTK tokenize because I don't know if the
      # syllable dictionary has entries for nonword NLTK tokens (eg 'll n't)
      tokens.extend(stanza.split())
      tokens.append(LINE_END)
      
    tokens += [HAIKU_END] * (NGRAM_SIZE - 1)
    reddit_tokens.append(tokens)

print(reddit_tokens[0:5])

[['<h>', '<h>', '<h>', '<h>', '<h>', '<h>', '<s>', 'delicate', 'savage', '</s>', '<s>', "you'll", 'never', 'hold', 'the', 'cinder', '</s>', '<s>', 'but', 'still', 'you', 'will', 'burn', '</s>', '</h>', '</h>', '</h>', '</h>', '</h>', '</h>'], ['<h>', '<h>', '<h>', '<h>', '<h>', '<h>', '<s>', 'our', 'destination', '</s>', '<s>', 'the', 'skyline', 'of', 'this', 'city', '</s>', '<s>', 'shining', 'horizon', '</s>', '</h>', '</h>', '</h>', '</h>', '</h>', '</h>'], ['<h>', '<h>', '<h>', '<h>', '<h>', '<h>', '<s>', 'a', 'splash', 'and', 'a', 'cry', '</s>', '<s>', 'words', 'pulled', 'from', 'the', 'riverside', '</s>', '<s>', 'dried', 'in', 'the', 'hot', 'sun', '</s>', '</h>', '</h>', '</h>', '</h>', '</h>', '</h>'], ['<h>', '<h>', '<h>', '<h>', '<h>', '<h>', '<s>', 'hurt', 'but', 'poised', 'for', 'war', '</s>', '<s>', 'sturdy', 'in', 'crestfallen', 'slumps', '</s>', '<s>', 'warrior', 'spirit', '</s>', '</h>', '</h>', '</h>', '</h>', '</h>', '</h>'], ['<h>', '<h>', '<h>', '<h>', '<h>', '<h>', '

Train embeddings

In [4]:
"""
Trains a word2vec model on the given sentences. Returns the trained word embeddings as a KeyedVectors object.
Function provided from HW4 starter code.
"""
def train_model(sentences, sg=1, window_size=5, vector_size=EMBEDDING_SIZE, min_count=1) :
  model = Word2Vec(sentences=sentences, vector_size=vector_size, window=window_size, min_count=min_count, sg=sg)
  return model.wv

reddit_haiku_embs = train_model(reddit_tokens)

In [5]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(reddit_tokens)
encoded = tokenizer.texts_to_sequences(reddit_tokens)

print(encoded[0:5])

[[1, 1, 1, 1, 1, 1, 3, 1444, 3133, 4, 3, 918, 68, 334, 5, 7333, 4, 3, 28, 62, 11, 33, 555, 4, 2, 2, 2, 2, 2, 2], [1, 1, 1, 1, 1, 1, 3, 57, 3134, 4, 3, 5, 2068, 12, 26, 451, 4, 3, 796, 615, 4, 2, 2, 2, 2, 2, 2], [1, 1, 1, 1, 1, 1, 3, 7, 2069, 13, 7, 437, 4, 3, 81, 1568, 36, 5, 7334, 4, 3, 2304, 10, 5, 274, 65, 4, 2, 2, 2, 2, 2, 2], [1, 1, 1, 1, 1, 1, 3, 518, 28, 4983, 16, 506, 4, 3, 3853, 10, 7335, 7336, 4, 3, 2305, 438, 4, 2, 2, 2, 2, 2, 2], [1, 1, 1, 1, 1, 1, 3, 3854, 1114, 538, 4, 3, 1005, 3855, 3135, 1275, 4, 3, 41, 5, 2070, 1569, 4, 2, 2, 2, 2, 2, 2]]


In [6]:
def generate_ngram_training_samples(encoded: list) -> list:
    '''
    Takes the encoded data (list of lists) and 
    generates the training samples out of it.
    Parameters:
    up to you, we've put in what we used
    but you can add/remove as needed
    return: 
    tuple of (training_x, training_y) in the format [[1, 2, 3], [2, 3, 2], ...] and [2, 4, ...]
    '''
    training_x = []
    training_y = []

    for sentence in encoded:
      for i in range(len(sentence) - NGRAM_SIZE + 1):
        training_x.append(sentence[i:i + NGRAM_SIZE - 1])
        training_y.append(sentence[i + NGRAM_SIZE - 1])

    return (training_x, training_y)

In [7]:
training_x, training_y = generate_ngram_training_samples(encoded)

print(training_x[0:5])
print(np.shape(training_x))
print(training_y[0:5])

[[1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 3], [1, 1, 1, 1, 3, 1444], [1, 1, 1, 3, 1444, 3133], [1, 1, 3, 1444, 3133, 4]]
(276808, 6)
[3, 1444, 3133, 4, 3]


In [8]:
def create_word_to_embedding(embs: KeyedVectors) -> dict:
    """
    Creates a mapping from each word in the embedding vocabulary to its embedding.
    """
    word_to_embedding = {}
    for word in embs.key_to_index.keys():
      word_to_embedding[word] = embs[word]
    return word_to_embedding

def create_index_to_embedding(embs: KeyedVectors, tokenizer: Tokenizer) -> dict:
  """
  Creates a mapping from the tokenizer index of each word in the embedding vocabulary to its embedding.
  """
  index_to_embedding = {}
  for word in embs.key_to_index.keys():
    index = tokenizer.word_index[word]
    index_to_embedding[index] = embs[word]
  return index_to_embedding

def get_word_to_index(word: str, tokenizer: Tokenizer):
  return tokenizer.texts_to_sequences([[word]])[0][0]

In [9]:
word_to_embedding = create_word_to_embedding(reddit_haiku_embs)
index_to_embedding = create_index_to_embedding(reddit_haiku_embs, tokenizer)
print(len(index_to_embedding))

14269


In [10]:
def data_generator(X: list, y: list, num_sequences_per_batch: int, i_to_emb: dict):
    '''
    Returns data generator to be used by feed_forward
    https://wiki.python.org/moin/Generators
    https://realpython.com/introduction-to-python-generators/
    
    Yields batches of embeddings and labels to go with them.
    Use one hot vectors to encode the labels 
    (see the to_categorical function)

    Requires a mapping to convert from tokenizer index to embedding vector.
    
    '''  
    embs = []
    labels = []
    i = 0
    while True:
        i = i % len(X)
        
        emb = [i_to_emb[n] for n in X[i]]  # [ [..200..], [..200..] ] list of lists, shape (n-1, embedding_size)
        embs.append(emb)  # list of list of lists, shape (batch_size, n-1, emb_size)
        # we want shape (batch_size, (n-1)*emb_size)

        # create one-hot vector with the 1 at the location of the tokenizer index
        # adding 1 to length to account for vector indices starting from 1 instead of 0
        label = to_categorical(y[i], num_classes=len(i_to_emb)+1)
        labels.append(label)
        if len(embs) % num_sequences_per_batch == 0:
            yield (np.reshape(embs, (num_sequences_per_batch, -1)), np.array(labels))
            embs = []
            labels = []

        i += 1

        
"""
sample = next(data_generator(rnn_training_x, rnn_training_y, 2, index_to_embedding))
sample = next(data_generator(training_x, training_y, 33, index_to_embedding))
print(sample)
print(np.shape(sample[0])) # batch_size, emb_size * n-1 -- (concatenated embeddings of n-1-word sample)
print(np.shape(sample[1])) # batch_size, len(index_to_embedding) -- (a one-hot vector for each nth word result)
"""


'\nsample = next(data_generator(rnn_training_x, rnn_training_y, 2, index_to_embedding))\nsample = next(data_generator(training_x, training_y, 33, index_to_embedding))\nprint(sample)\nprint(np.shape(sample[0])) # batch_size, emb_size * n-1 -- (concatenated embeddings of n-1-word sample)\nprint(np.shape(sample[1])) # batch_size, len(index_to_embedding) -- (a one-hot vector for each nth word result)\n'

In [11]:
train_generator = data_generator(training_x, training_y, BATCH_SIZE, index_to_embedding)

# Model 1: feedforward NN

In [12]:
def build_feed_forward_model(input_units, hidden_units, output_units):
  model = Sequential()
  
  model.add(Input(shape=(input_units,)))  # inputs will be vectors of this length, batch size not specified
  model.add(Dense(hidden_units, activation="softmax"))
  model.add(Dense(output_units, activation="softmax"))
  
  model.compile(optimizer=Adam(learning_rate=0.01), loss=CategoricalCrossentropy())
  return model

In [13]:
output_units = len(reddit_haiku_embs.key_to_index.keys()) + 1
hidden_units = 1000 #round((INPUT_UNITS + output_units) / 2)

feed_forward_model = build_feed_forward_model(INPUT_UNITS, hidden_units, output_units)


In [14]:
feed_forward_model.fit(x=train_generator, epochs=EPOCHS, steps_per_epoch=len(training_x) // BATCH_SIZE)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x1fe25f70c70>

In [33]:
feed_forward_model.save("ffnn_model_trained")

INFO:tensorflow:Assets written to: ffnn_model_trained\assets


INFO:tensorflow:Assets written to: ffnn_model_trained\assets


# Model 2: RNN

In [15]:
# lstm produces 1 label per timestep, so timestep = n-1-gram 
# timestep size = NGRAM_SIZE - 1 (4)
# timestep consists of n-1 word embeddings, each with some features
# features = EMBEDDING_SIZE (200)
# then we can do this for a certain number of batches, lets say 128 still
# the inputs to the LSTM need to have shape (batch_size, timesteps, features)
# so for us that means shape (BATCH_SIZE, NGRAM_SIZE-1, EMBEDDING_SIZE)

# upshot: have to make a new data generator, sicne the FFNN one squished all the embeddings together

# timestep = ngram length is supported by several example articles: 
# http://ethen8181.github.io/machine-learning/keras/rnn_language_model_basic_keras.html

In [16]:
def rnn_data_generator(X: list, y: list, batch_size: int, i_to_emb: dict):
    '''
    Produces a data generator for an RNN.
    Output data is of shape (batch_size, len(X[0]), len(i_to_emb.values()[1])
    i.e. (batch_size, ngram_size - 1, embedding_size)
    Output labels are one-hot vectors of shape (batch_size, len(i_to_emb.keys())+1)
    i.e. (batch_size, vocab_size) 
    '''  
    embs = []
    labels = []
    i = 0
    while True:
        i = i % len(X)
        
        emb = [i_to_emb[n] for n in X[i]]  # [ [..200..], [..200..] ] list of lists, shape (n-1, embedding_size)
        embs.append(emb)  # list of list of lists, shape (batch_size, n-1, emb_size)

        # create one-hot vector with the 1 at the location of the tokenizer index
        # adding 1 to length to account for vector indices starting from 1 instead of 0
        label = to_categorical(y[i], num_classes=len(i_to_emb)+1)
        labels.append(label)
        if len(embs) % batch_size == 0:
            #yield (np.array(embs), np.reshape(labels, (batch_size, len(i_to_emb)+1, 1)))
            yield (np.array(embs), np.array(labels))
            embs = []
            labels = []

        i += 1

rnn_training_generator = rnn_data_generator(training_x, training_y, BATCH_SIZE, index_to_embedding)
sample = next(rnn_training_generator)
print(np.shape(sample[0]))
print(np.shape(sample[1]))

(128, 6, 200)
(128, 14270)


In [59]:
def build_rnn_model(timestep_size, input_units, hidden_units, output_units):
    model = Sequential()
    
    # input size needs to be a tuple of (timesteps, features), 
    # per https://towardsdatascience.com/a-practical-guide-to-rnn-and-lstm-in-keras-980f176271bc
    model.add(Input(shape=(timestep_size, input_units)))  # (4, 200)
    model.add(LSTM(hidden_units))
    model.add(Dense(output_units, activation="softmax"))
    
    model.compile(optimizer=Adam(learning_rate=0.01), loss=CategoricalCrossentropy())
    return model

In [62]:
hidden_units = 500 # a lot of the examples used 128
output_units = len(reddit_haiku_embs.key_to_index.keys()) + 1

rnn_model = build_rnn_model(NGRAM_SIZE - 1, EMBEDDING_SIZE, hidden_units, output_units)
print(rnn_model.summary())

Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_7 (LSTM)               (None, 500)               1402000   
                                                                 
 dense_7 (Dense)             (None, 14270)             7149270   
                                                                 
Total params: 8,551,270
Trainable params: 8,551,270
Non-trainable params: 0
_________________________________________________________________
None


In [45]:
#y = rnn_model(next(rnn_training_generator)[0])
#print(y)  # output of model is (batch_size, vocab_size), i.e. a one-hot vector for each timestep
#print(rnn_model.summary())

In [63]:
rnn_model.fit(x=rnn_training_generator, epochs=EPOCHS, steps_per_epoch=len(training_x) // BATCH_SIZE)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x1fe679f32b0>

In [66]:
#rnn_model.save("rnn_model_trained")  # NGRAM_SIZE = 7, 128 hidden units
rnn_model.save("rnn_model_large_trained") # NGRAM_SIZE = 7, 500 hidden units



INFO:tensorflow:Assets written to: rnn_model_large_trained\assets


INFO:tensorflow:Assets written to: rnn_model_large_trained\assets


In [21]:
# an earlier attempt, treating an entire haiku as a sequence, padding them all to the same length, 
# using each ngram as a timestep, and using the concatenated 800-unit embeddings for each ngram

In [22]:
# make all haiku token sequences the same length/# of timesteps
# by padding the beginning with placeholder tokens (beginning so no other tokens lead to the placeholder)
# doing this to the encoded version bc keras's built-in function deals with integers, not strings
rnn_encoded = pad_sequences(encoded, maxlen=MAX_TOKENS)
#print(rnn_encoded[:5])

rnn_training_x, rnn_training_y = generate_ngram_training_samples(lstm_encoded)
print(rnn_training_x[11:16])
print(np.shape(rnn_training_x))
print(rnn_training_y[11:16])

NameError: name 'MAX_TOKENS' is not defined

In [None]:
def rnn_data_generator(X, y, batch_size, timesteps, i_to_emb):
    """
    Generates data suitable for an RNN (in timestep groups).
    Produces a tuple of training samples, shape=(batch_size, timesteps, INPUT_SIZE)
    and their accompanying labels, shape=(batch_size, timesteps, VOCAB_SIZE)
    """
    dg = data_generator(X, y, timesteps, i_to_emb)  # use ffnn data generator to get timestep blocks
    i = 0
    xs = []
    ys = []
    while True:
        i = i % len(X)
        next_x, next_y = next(dg)  # (TIMESTEPS, INPUT_SIZE), (TIMESTEPS, VOCAB_SIZE)
        xs.append(next_x)
        ys.append(next_y)
        if len(xs) % batch_size == 0:
            yield (np.array(xs), np.array(ys)) # ((BATCH_SIZE, TIMESTEPS, INPUT_SIZE), (BATCH_SIZE, TIMESTEPS, VOCAB_SIZE))
            xs = []
            ys = []

        i += 1

rnn_train_generator = rnn_data_generator(rnn_training_x, rnn_training_y, 1, TIMESTEPS, index_to_embedding)
sample = next(rnn_train_generator)
print(np.shape(sample[0]))
print(sample[0])
print(np.shape(sample[1]))
print(sample[1])

In [None]:
def build_rnn_model(input_units, hidden_units, output_units):
    model = Sequential()
    
    # input size needs to be a tuple of (timesteps, features), 
    # per https://towardsdatascience.com/a-practical-guide-to-rnn-and-lstm-in-keras-980f176271bc
    model.add(Input(shape=(TIMESTEPS, input_units)))
    model.add(LSTM(hidden_units)) #, input_shape=(TIMESTEPS, input_units)))
    model.add(Dense(output_units, activation="softmax"))
    
    model.compile(optimizer=Adam(learning_rate=0.01), loss=CategoricalCrossentropy(axis=2)) # trying axis 2 to see if that fixes the error
    return model


In [None]:
output_units = len(reddit_haiku_embs.key_to_index.keys()) + 1
hidden_units = 128  # lots of examples used 128 lstm units so I'm going with that for now

rnn_model = build_rnn_model(INPUT_UNITS, hidden_units, output_units)
rnn_model.summary()

In [None]:
rnn_model.fit(x=rnn_train_generator, epochs=1, steps_per_epoch=len(rnn_training_x) // (1 * TIMESTEPS))

# Generate Haikus

In [47]:
# load saved models
#feed_forward_model = load_model("ffnn_model_trained")
rnn_model_1 = load_model("rnn_model_trained")

In [75]:
def predict_data_generator(X: list, num_sequences_per_batch: int, i_to_emb: dict, is_rnn=False) -> list:
    '''
    Returns data generator to be used for prediction data
    
    Yields batches of embeddings to go with them.
    Use one hot vectors to encode the labels 
    (see the to_categorical function)

    Requires a mapping to convert from tokenizer index to embedding vector.
    
    '''  
    embs = []
    for i in range(len(X)):
        emb = [i_to_emb[n] for n in X[i]]  # [ [..200..], [..200..] ] list of lists, shape (n-1, embedding_size)
        embs.append(emb)  # list of list of lists, shape (batch_size, n-1, emb_size)
        # we want shape (batch_size, (n-1)*emb_size)

        # create one-hot vector with the 1 at the location of the tokenizer index
        if len(embs) % num_sequences_per_batch == 0:
            if is_rnn:
                yield np.array(embs)
            else:
                yield np.reshape(embs, (num_sequences_per_batch, -1))
            embs = []


def generate_haiku(model: Sequential, 
                 tokenizer: Tokenizer, 
                 seed: list,
                 i_to_emb: dict,
                 n_words: int,
                  is_rnn=False):
    '''
    Generate a haiku from the given model
    
    Parameters:
        model: your neural network
        tokenizer: the keras preprocessing tokenizer
        seed: [w1, w2, w(n-1)]
        n_words: generate a sentence of length n_words
    Returns: string sentence
    '''
    sentence = seed
    sentence_indices = tokenizer.texts_to_sequences([seed])[0]

    # make the input list for the model.predict
    # format is the n_grams so [[1, 2], [2, 3], [3, 4] ...]
    predict_input = []
    for i in range(len(sentence_indices) - NGRAM_SIZE + 2):
        predict_input += [sentence_indices[i:i + NGRAM_SIZE]]
    
    
    n_words_generated = 0
    while n_words_generated < n_words:
        #print("predict input: ", predict_input)
        if is_rnn:
            gen = predict_data_generator(predict_input, len(predict_input), i_to_emb, is_rnn=True)
        else:
            gen = predict_data_generator(predict_input, len(predict_input), i_to_emb)
        probabilities = model.predict(x=gen, verbose=None)[0]

        all_word_counts = [i for i in range(len(i_to_emb.keys()) + 1)]
        sampled_index = np.random.choice(all_word_counts, p=probabilities)
        new_word = tokenizer.sequences_to_texts([[sampled_index]])[0]
      
        sentence.append(new_word)
        sentence_indices.append(sampled_index)
        predict_input.append(sentence_indices[-(NGRAM_SIZE - 1):])
        predict_input = predict_input[1:]
      
        if sentence[-1] == HAIKU_END:
            break
      
        n_words_generated += 1
    

    return " ".join(sentence)


In [None]:
def get_syllables(sentence: list, syllable_dictionary: dict) -> int:
    '''
    Counts the number of syllables in the given sentence.
    Meta-tokens like line begin/end do not affect syllable count.
    '''
    
    count = 0
    for word in sentence:
        if word in [HAIKU_BEGIN, HAIKU_END, LINE_BEGIN, LINE_END]:
            count += 0
        else:
            count += syllable_dictionary[word]
    
    return count

def generate_haiku_greedy(model: Sequential, tokenizer: Tokenizer, seed=None: list, i_to_emb: dict, is_rnn=False):
    '''
    Generates a haiku from the model, ensuring a syllable fit using a greedy algorithm.
    
    Seed (optional) should be a list of tokens of length NGRAM_SIZE - 1 for the model to predict from.
    If not specified, NGRAM_SIZE-1 haiku begin tokens will be used.
    '''
    if seed is None:
        sentence = [HAIKU_BEGIN] * (NGRAM_SIZE - 1)
    else:
        sentence = seed
    
    sentence_indices = tokenizer.texts_to_sequences([sentence])[0]
    
    current_line = 1
    
    haiku = []
    while True:
        sentence_embs = [i_to_emb[i] for i in sentence_indices]
        current_syllables = get_syllables(sentence)
        
        probabilities = model.predict(x=seed_embs, verbose=None)[0]
        
        all_word_indices = [i for i in range(len(i_to_emb.keys()) + 2)]
        sampled_index = np.random.choice(all_word_counts, p=probabilities)
        new_word = tokenizer.sequences_to_texts([[sampled_index]])[0]
        
        if current_syllables + get_syllables([new_word]) == line_structure(current_line):
            sentence.append(new_word)
            sentence.append(LINE_END)
        
    
                          

In [76]:
#haiku = generate_haiku(feed_forward_model, tokenizer, [HAIKU_BEGIN] * (NGRAM_SIZE - 1), index_to_embedding, 30)

rnn_large_haiku = generate_haiku(rnn_model, tokenizer, [HAIKU_BEGIN] * (NGRAM_SIZE - 1), index_to_embedding, 30, is_rnn=True)
print(rnn_large_haiku)
rnn_haiku = generate_haiku(rnn_model_1, tokenizer, [HAIKU_BEGIN] * (NGRAM_SIZE - 1), index_to_embedding, 30, is_rnn=True)
print(rnn_haiku)

<h> <h> <h> <h> <h> <h> <s> planes </s> <s> a half new of moonlight </s> <s> perhaps those pull sad </s> </h>
<h> <h> <h> <h> <h> <h> <s> prick colors </s> <s> this become look for us </s> </h>


In [57]:
print(rnn_2_haiku)
print(rnn_haiku)

<h> <h> <h> <h> <h> <h> </s> </s> </s> </s> <s> is i meet who the sleep the by the at while the haiku the paint the remains the very the sand the times the the
<h> <h> <h> <h> <h> <h> </s> </s> </s> </s> <s> maybe guilty a page the hands the than the heart the little made the way the the steps the mine the lord the while made
