# Train word embeddings

In [80]:
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, LSTM
from keras.optimizers import Adam
from keras.losses import CategoricalCrossentropy
from keras import Input
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

In [81]:
# Constants

# Sentinel tokens wrapped around every haiku and around every line of a haiku.
HAIKU_BEGIN = "<h>"
HAIKU_END = "</h>"
LINE_BEGIN = "<s>"
LINE_END = "</s>"

# Size of the n-grams: NGRAM_SIZE - 1 context tokens are used to predict the next token.
NGRAM_SIZE = 5
# Length of each word2vec vector (default vector_size of train_model).
EMBEDDING_SIZE = 200

# Number of training sequences per batch produced by the training data generator.
BATCH_SIZE = 128

# Length of the flattened model input: one embedding per context token.
INPUT_UNITS = (NGRAM_SIZE - 1) * EMBEDDING_SIZE

# Number of epochs passed to model.fit.
EPOCHS = 3


In [82]:
# As agreed with Felix, each training example is a whole haiku so the model can
# learn the poem structure. Every haiku becomes one flat token list:
# NGRAM_SIZE - 1 copies of <h>, then <s> ...words... </s> for each line, then
# NGRAM_SIZE - 1 copies of </h>.

haiku_loc = "data/haiku_reddit.txt"
reddit_tokens = []
with open(haiku_loc, 'r', encoding='utf-8') as f:
  for line in f:
    # pad the start so the very first word already has a full context window
    tokens = [HAIKU_BEGIN] * (NGRAM_SIZE - 1)

    # lines of a haiku are separated by "/"
    for stanza in line.split("/"):
      # strip surrounding spaces and the end-of-poem "$" + newline marker, then
      # split on whitespace rather than using the NLTK tokenizer because the
      # syllable dictionary may lack entries for nonword NLTK tokens (eg 'll n't)
      tokens += [LINE_BEGIN, *stanza.strip(' $\n').split(), LINE_END]

    tokens += [HAIKU_END] * (NGRAM_SIZE - 1)
    reddit_tokens.append(tokens)

print(reddit_tokens[:5])

[['<h>', '<h>', '<h>', '<h>', '<s>', 'delicate', 'savage', '</s>', '<s>', "you'll", 'never', 'hold', 'the', 'cinder', '</s>', '<s>', 'but', 'still', 'you', 'will', 'burn', '</s>', '</h>', '</h>', '</h>', '</h>'], ['<h>', '<h>', '<h>', '<h>', '<s>', 'our', 'destination', '</s>', '<s>', 'the', 'skyline', 'of', 'this', 'city', '</s>', '<s>', 'shining', 'horizon', '</s>', '</h>', '</h>', '</h>', '</h>'], ['<h>', '<h>', '<h>', '<h>', '<s>', 'a', 'splash', 'and', 'a', 'cry', '</s>', '<s>', 'words', 'pulled', 'from', 'the', 'riverside', '</s>', '<s>', 'dried', 'in', 'the', 'hot', 'sun', '</s>', '</h>', '</h>', '</h>', '</h>'], ['<h>', '<h>', '<h>', '<h>', '<s>', 'hurt', 'but', 'poised', 'for', 'war', '</s>', '<s>', 'sturdy', 'in', 'crestfallen', 'slumps', '</s>', '<s>', 'warrior', 'spirit', '</s>', '</h>', '</h>', '</h>', '</h>'], ['<h>', '<h>', '<h>', '<h>', '<s>', 'steamy', 'mist', 'rising', '</s>', '<s>', 'rocks', 'receiving', 'downward', 'crash', '</s>', '<s>', 'as', 'the', 'jungle', 'wee

Train embeddings

In [83]:
"""
Trains a word2vec model on the given sentences. Returns the trained word embeddings as a KeyedVectors object.
Function provided from HW4 starter code.
"""
def train_model(sentences, sg=1, window_size=5, vector_size=EMBEDDING_SIZE, min_count=1):
  """
  Train a word2vec model and return its word vectors.

  Parameters:
      sentences: the tokenized training sentences (here: lists of tokens, one per haiku)
      sg: forwarded unchanged to Word2Vec's `sg` argument
      window_size: forwarded to Word2Vec's `window` argument
      vector_size: length of each learned embedding vector
      min_count: forwarded to Word2Vec's `min_count` argument
  Returns:
      the `wv` attribute (KeyedVectors) of the trained Word2Vec model
  """
  model = Word2Vec(sentences=sentences, vector_size=vector_size, window=window_size, min_count=min_count, sg=sg)
  return model.wv

reddit_haiku_embs = train_model(reddit_tokens)

In [84]:
# lower=False: the word2vec model was trained on the tokens exactly as they
# appear in reddit_tokens, so the tokenizer must not lower-case them. Otherwise
# a token containing an upper-case character would be missing from
# tokenizer.word_index when the index -> embedding table is built.
tokenizer = Tokenizer(lower=False)
tokenizer.fit_on_texts(reddit_tokens)
encoded = tokenizer.texts_to_sequences(reddit_tokens)

print(encoded[0:5])

[[1, 1, 1, 1, 3, 1444, 3133, 4, 3, 918, 68, 334, 5, 7333, 4, 3, 28, 62, 11, 33, 555, 4, 2, 2, 2, 2], [1, 1, 1, 1, 3, 57, 3134, 4, 3, 5, 2068, 12, 26, 451, 4, 3, 796, 615, 4, 2, 2, 2, 2], [1, 1, 1, 1, 3, 7, 2069, 13, 7, 437, 4, 3, 81, 1568, 36, 5, 7334, 4, 3, 2304, 10, 5, 274, 65, 4, 2, 2, 2, 2], [1, 1, 1, 1, 3, 518, 28, 4983, 16, 506, 4, 3, 3853, 10, 7335, 7336, 4, 3, 2305, 438, 4, 2, 2, 2, 2], [1, 1, 1, 1, 3, 3854, 1114, 538, 4, 3, 1005, 3855, 3135, 1275, 4, 3, 41, 5, 2070, 1569, 4, 2, 2, 2, 2]]


In [85]:
def generate_ngram_training_samples(encoded: list, ngram_size: int = None) -> tuple:
    '''
    Slides an n-gram window over every encoded sentence and splits each
    window into its context (all but the last index) and its label (the
    last index).

    Parameters:
        encoded: list of sentences, each a list of tokenizer indices
        ngram_size: window length; defaults to the module-level NGRAM_SIZE
    return:
        tuple of (training_x, training_y) in the format
        [[1, 2, 3], [2, 3, 2], ...] and [2, 4, ...]. Sentences shorter than
        ngram_size contribute no samples.
    Raises:
        ValueError: if ngram_size is smaller than 2 (no context would be left)
    '''
    if ngram_size is None:
        ngram_size = NGRAM_SIZE
    if ngram_size < 2:
        raise ValueError("ngram_size must be at least 2")

    context_size = ngram_size - 1
    training_x = []
    training_y = []

    for sentence in encoded:
        # the last valid window starts at len(sentence) - ngram_size
        for start in range(len(sentence) - context_size):
            training_x.append(sentence[start:start + context_size])
            training_y.append(sentence[start + context_size])

    return (training_x, training_y)

In [86]:
# Split every encoded haiku into (context, next-token) training pairs.
training_x, training_y = generate_ngram_training_samples(encoded)

# Show the first few contexts and their labels as a sanity check.
print(training_x[0:5])
print(training_y[0:5])

[[1, 1, 1, 1], [1, 1, 1, 3], [1, 1, 3, 1444], [1, 3, 1444, 3133], [3, 1444, 3133, 4]]
[3, 1444, 3133, 4, 3]


In [87]:
def create_word_to_embedding(embs: KeyedVectors) -> dict:
    """
    Builds a plain dict that maps every word known to `embs` to its vector.

    Parameters:
        embs: the trained embeddings; its `key_to_index` lists the vocabulary
    Returns:
        dict of word -> embedding, looked up via `embs[word]`
    """
    return {word: embs[word] for word in embs.key_to_index}

def create_index_to_embedding(embs: KeyedVectors, tokenizer: Tokenizer) -> dict:
  """
  Builds a dict from tokenizer index to embedding vector.

  For every word in the embedding vocabulary, the word's index is read from
  `tokenizer.word_index` and used as the key for `embs[word]`.
  """
  word_index = tokenizer.word_index
  return {word_index[word]: embs[word] for word in embs.key_to_index}

def get_word_to_index(word: str, tokenizer: Tokenizer):
  """
  Returns the tokenizer index of a single word.

  The word is wrapped as a one-token text so it can be passed through
  `tokenizer.texts_to_sequences`; the only index of the only sequence is returned.
  """
  sequences = tokenizer.texts_to_sequences([[word]])
  only_sequence = sequences[0]
  return only_sequence[0]

In [88]:
# Lookup tables built from the trained haiku embeddings: one keyed by the word
# itself, one keyed by the word's tokenizer index (used by the data generators).
word_to_embedding = create_word_to_embedding(reddit_haiku_embs)
index_to_embedding = create_index_to_embedding(reddit_haiku_embs, tokenizer)

In [89]:
def data_generator(X: list, y: list, num_sequences_per_batch: int, i_to_emb: dict):
    '''
    Returns data generator to be used by feed_forward
    https://wiki.python.org/moin/Generators
    https://realpython.com/introduction-to-python-generators/

    Endlessly yields (inputs, labels) batches:
      inputs: array of shape (num_sequences_per_batch, context_len * emb_size),
              the embeddings of each context concatenated into one row
      labels: float32 one-hot array of shape
              (num_sequences_per_batch, len(i_to_emb) + 1)
    Samples are taken in order and wrap around to the start of X, so a batch
    may contain samples from both the end and the beginning of the data.

    Parameters:
        X: list of contexts, each a list of tokenizer indices
        y: list of label indices, one per context in X
        num_sequences_per_batch: number of samples in every yielded batch
        i_to_emb: mapping from tokenizer index to embedding vector
    Raises:
        ValueError: (on first iteration) if X is empty, y is shorter than X,
                    or num_sequences_per_batch is smaller than 1
    '''
    if not X:
        raise ValueError("X must contain at least one training sequence")
    if len(y) < len(X):
        raise ValueError("y must provide a label for every sequence in X")
    if num_sequences_per_batch < 1:
        raise ValueError("num_sequences_per_batch must be at least 1")

    # one extra class because tokenizer indices start at 1, so i_to_emb has no
    # entry for 0 while the one-hot vectors are indexed from 0
    num_classes = len(i_to_emb) + 1
    num_samples = len(X)

    i = 0
    while True:
        embs = []  # becomes shape (batch_size, n-1, emb_size)
        labels = np.zeros((num_sequences_per_batch, num_classes), dtype="float32")

        for row in range(num_sequences_per_batch):
            embs.append([i_to_emb[n] for n in X[i]])
            # one-hot: the 1 sits at the position of the label's tokenizer index
            labels[row, y[i]] = 1.0
            i = (i + 1) % num_samples

        # flatten each sample to a single vector: (batch_size, (n-1)*emb_size)
        yield (np.reshape(embs, (num_sequences_per_batch, -1)), labels)
      

# sample = next(data_generator(training_x[:3], training_y[:3], 2, index_to_embedding))
# print(sample)

In [90]:
# Endless generator of (embedded contexts, one-hot labels) batches for model.fit.
train_generator = data_generator(training_x, training_y, BATCH_SIZE, index_to_embedding)

# Model 1: feedforward NN

In [91]:
def build_feed_forward_model(input_units, hidden_units, output_units):
  """
  Builds and compiles a feedforward network with a single hidden layer.

  Parameters:
      input_units: length of each (flattened) input vector
      hidden_units: number of units in the hidden layer
      output_units: number of output classes
  Returns:
      the compiled Sequential model (Adam, learning rate 0.01, categorical cross-entropy)
  """
  model = Sequential()

  model.add(Input(shape=(input_units,)))  # inputs will be vectors of this length, batch size not specified
  # relu rather than softmax on the hidden layer: softmax would force the
  # hidden activations to sum to 1, leaving the output layer very little signal
  model.add(Dense(hidden_units, activation="relu"))
  # softmax only on the output so it forms a probability distribution over classes
  model.add(Dense(output_units, activation="softmax"))

  model.compile(optimizer=Adam(learning_rate=0.01), loss=CategoricalCrossentropy())
  return model

# Model 2: RNN

In [92]:
def build_rnn_model(input_units, hidden_units, output_units):
    """
    Builds and compiles an LSTM model followed by a softmax output layer.

    Parameters:
        input_units: number of features per timestep; the LSTM is declared
            with input_shape=(1, input_units), i.e. one timestep per sample
        hidden_units: currently unused, the LSTM size is fixed at 128
        output_units: number of output classes
    Returns:
        the compiled Sequential model (Adam, learning rate 0.01, categorical cross-entropy)
    """
    model = Sequential()

    print(input_units)

    model.add(LSTM(128, input_shape=(1, input_units)))
    model.add(Dense(output_units, activation="softmax"))

    model.compile(optimizer=Adam(learning_rate=0.01), loss=CategoricalCrossentropy())
    # hand the model back to the caller; without this the function returned None
    return model


In [93]:
# One output class per vocabulary word, plus one because tokenizer indices
# start at 1 (same num_classes as the one-hot labels in data_generator).
output_units = len(reddit_haiku_embs.key_to_index.keys()) + 1
# Fixed hidden size; the commented expression is the earlier input/output average heuristic.
hidden_units = 1000 #round((INPUT_UNITS + output_units) / 2)

feed_forward_model = build_feed_forward_model(INPUT_UNITS, hidden_units, output_units)
# rnn_model = build_rnn_model(INPUT_UNITS, hidden_units, output_units)


In [94]:
# The generator never ends, so steps_per_epoch tells Keras how many full batches make up one epoch.
feed_forward_model.fit(x=train_generator, epochs=EPOCHS, steps_per_epoch=len(training_x) // BATCH_SIZE)

Epoch 1/3


2023-04-15 23:49:20.390519: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int32
	 [[{{node Placeholder/_0}}]]


Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x13e2a2cd0>

# Generate Haikus

In [95]:
def predict_data_generator(X: list, num_sequences_per_batch: int, i_to_emb: dict):
    '''
    Returns data generator to be used for prediction data

    Walks over X once and yields batches of embedded contexts. Each batch is
    an array of shape (batch_size, context_len * emb_size): the embeddings of
    one context concatenated into a single row. The last batch is smaller
    than num_sequences_per_batch when len(X) is not a multiple of it, so
    every context in X is yielded exactly once.

    Parameters:
        X: list of contexts, each a list of tokenizer indices
        num_sequences_per_batch: maximum number of contexts per yielded batch
        i_to_emb: mapping from tokenizer index to embedding vector
    Raises:
        ValueError: (on first iteration) if num_sequences_per_batch is smaller than 1
    '''
    if num_sequences_per_batch < 1:
        raise ValueError("num_sequences_per_batch must be at least 1")

    for start in range(0, len(X), num_sequences_per_batch):
        batch = X[start:start + num_sequences_per_batch]
        # shape (batch_size, n-1, emb_size) before flattening
        embs = [[i_to_emb[n] for n in context] for context in batch]
        # we want shape (batch_size, (n-1)*emb_size)
        yield np.reshape(embs, (len(batch), -1))


def generate_haiku(model: Sequential,
                   tokenizer: Tokenizer,
                   seed: list,
                   i_to_emb: dict,
                   n_words: int):
    '''
    Generate a haiku from the given model

    Starting from the seed, repeatedly feeds the most recent NGRAM_SIZE - 1
    tokens to the model, samples the next token from the predicted
    distribution and appends it. Stops after n_words tokens or as soon as
    HAIKU_END is generated. The seed list passed in is not modified.

    Parameters:
        model: your neural network
        tokenizer: the keras preprocessing tokenizer
        seed: [w1, w2, w(n-1)]; needs at least NGRAM_SIZE - 1 words known to the tokenizer
        i_to_emb: mapping from tokenizer index to embedding vector
        n_words: maximum number of words to generate after the seed
    Returns: string sentence (seed words followed by the generated words, space separated)
    Raises:
        ValueError: if the seed yields fewer than NGRAM_SIZE - 1 tokenizer indices
    '''
    context_size = NGRAM_SIZE - 1

    # copy so appending generated words does not change the caller's list
    sentence = list(seed)
    sentence_indices = tokenizer.texts_to_sequences([sentence])[0]
    if len(sentence_indices) < context_size:
        raise ValueError(
            "seed must contain at least %d words known to the tokenizer" % context_size)

    for _ in range(n_words):
        # only the latest window matters for predicting the next word
        context = sentence_indices[-context_size:]
        model_input = next(predict_data_generator([context], 1, i_to_emb))

        # float64 copy so renormalising satisfies np.random.choice's sum-to-1 check
        probabilities = np.asarray(model.predict(x=model_input, verbose=0)[0], dtype="float64")
        # output position k is tokenizer index k; position 0 exists only because
        # tokenizer indices start at 1, so it must never be sampled
        probabilities[0] = 0.0
        probabilities /= probabilities.sum()

        sampled_index = int(np.random.choice(len(probabilities), p=probabilities))
        new_word = tokenizer.sequences_to_texts([[sampled_index]])[0]

        sentence.append(new_word)
        sentence_indices.append(sampled_index)

        if new_word == HAIKU_END:
            break

    return " ".join(sentence)



In [100]:
# Seed with the same begin-of-haiku padding used in training and generate at most 30 words.
haiku = generate_haiku(feed_forward_model, tokenizer, [HAIKU_BEGIN] * (NGRAM_SIZE - 1), index_to_embedding, 30)



2023-04-16 00:02:53.496762: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int32
	 [[{{node Placeholder/_0}}]]
2023-04-16 00:02:53.577149: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int32
	 [[{{node Placeholder/_0}}]]
2023-04-16 00:02:53.655096: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int32
	 [[{{node Placeholder/_0}}]]
2023-04-16 00:02:53.74

In [101]:
# Print the raw generated token string, including the <h>/<s> marker tokens.
print(haiku)

<h> <h> <h> <h> <s> your failed </s> head rush choose closed </s> the some i aside <s> </s> the <s> flickers he with the the tired fleeting the free rise <s> summer don't
