# Train word embeddings

In [1]:
import numpy as np
import keras
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from keras import Input
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

2023-04-14 10:19:19.414343: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Load and clean data

In [2]:
# Delimiter tokens wrapped around each whole poem and around each of its lines.
HAIKU_BEGIN, HAIKU_END = "<h>", "</h>"
LINE_BEGIN, LINE_END = "<s>", "</s>"

# Length of every training window: NGRAM_SIZE - 1 input tokens plus 1 label token.
NGRAM_SIZE = 6

# Number of training samples yielded per batch by the data generator.
BATCH_SIZE = 128

# Dimensionality of the trained word2vec vectors.
EMBEDDING_SIZE = 200

In [None]:
# Per the meeting with Felix, we train on whole haiku so the model can learn
# the structure of a poem. The result is a list of tokenized haiku, each one
# wrapped in poem and line separator tokens:
# [[<h>, <s>, line 1, </s>, <s>, line 2, </s>, <s>, line 3, </s>, </h>], ...]

haiku_loc = "data/haiku_reddit.txt"
reddit_tokens = []
with open(haiku_loc, 'r', encoding='utf-8') as f:
  for line in f:
    tokens = [HAIKU_BEGIN]
    # the lines of a poem are separated by "/" within one line of the file
    for stanza in line.split("/"):
      # Strip surrounding spaces and the end-of-poem "$\n" marker, then split
      # on whitespace. NLTK tokenization is avoided on purpose: it is unclear
      # whether the syllable dictionary has entries for non-word NLTK tokens
      # (e.g. 'll, n't).
      tokens += [LINE_BEGIN, *stanza.strip(' $\n').split(), LINE_END]
    tokens.append(HAIKU_END)
    reddit_tokens.append(tokens)

print(reddit_tokens[0:5])

[['<h>', '<s>', 'delicate', 'savage', '</s>', '<s>', "you'll", 'never', 'hold', 'the', 'cinder', '</s>', '<s>', 'but', 'still', 'you', 'will', 'burn', '</s>', '</h>'], ['<h>', '<s>', 'our', 'destination', '</s>', '<s>', 'the', 'skyline', 'of', 'this', 'city', '</s>', '<s>', 'shining', 'horizon', '</s>', '</h>'], ['<h>', '<s>', 'a', 'splash', 'and', 'a', 'cry', '</s>', '<s>', 'words', 'pulled', 'from', 'the', 'riverside', '</s>', '<s>', 'dried', 'in', 'the', 'hot', 'sun', '</s>', '</h>'], ['<h>', '<s>', 'hurt', 'but', 'poised', 'for', 'war', '</s>', '<s>', 'sturdy', 'in', 'crestfallen', 'slumps', '</s>', '<s>', 'warrior', 'spirit', '</s>', '</h>'], ['<h>', '<s>', 'steamy', 'mist', 'rising', '</s>', '<s>', 'rocks', 'receiving', 'downward', 'crash', '</s>', '<s>', 'as', 'the', 'jungle', 'weeps', '</s>', '</h>']]


Train embeddings

In [None]:
"""
Trains a word2vec model on the given sentences. Returns the trained word embeddings as a KeyedVectors object.
Function provided from HW4 starter code.
"""
def train_model(sentences, sg=1, window_size=5, vector_size=EMBEDDING_SIZE, min_count=1) :
  model = Word2Vec(sentences=sentences, vector_size=vector_size, window=window_size, min_count=min_count, sg=sg)
  return model.wv

reddit_haiku_embs = train_model(reddit_tokens)

In [None]:
# Map every token to an integer index (indices start at 1; 0 is never assigned).
# lower=False keeps the tokenizer vocabulary identical to the word2vec
# vocabulary: word2vec is case-sensitive, so with the default lower=True any
# token containing an uppercase letter would be missing from
# tokenizer.word_index and raise a KeyError when the embeddings are looked up
# by tokenizer index.
tokenizer = Tokenizer(lower=False)
tokenizer.fit_on_texts(reddit_tokens)
encoded = tokenizer.texts_to_sequences(reddit_tokens)

print(encoded[0:5])

[[3, 1, 1444, 3133, 2, 1, 918, 68, 334, 5, 7333, 2, 1, 28, 62, 11, 33, 555, 2, 4], [3, 1, 57, 3134, 2, 1, 5, 2068, 12, 26, 451, 2, 1, 796, 615, 2, 4], [3, 1, 7, 2069, 13, 7, 437, 2, 1, 81, 1568, 36, 5, 7334, 2, 1, 2304, 10, 5, 274, 65, 2, 4], [3, 1, 518, 28, 4983, 16, 506, 2, 1, 3853, 10, 7335, 7336, 2, 1, 2305, 438, 2, 4], [3, 1, 3854, 1114, 538, 2, 1, 1005, 3855, 3135, 1275, 2, 1, 41, 5, 2070, 1569, 2, 4]]


In [None]:
def generate_ngram_training_samples(encoded: list, ngram_size=None) -> tuple:
    '''
    Slides a window of ngram_size tokens over every encoded sequence and
    turns each window into one training sample: the first ngram_size - 1
    token indices are the input and the last token index is the label.

    Windows never span two sequences, and sequences shorter than ngram_size
    produce no samples.

    Parameters:
    encoded: list of lists of token indices, one inner list per sequence
    ngram_size: length of each window including the label token; when None
        (the default) the module-level NGRAM_SIZE is used
    return:
    tuple of (training_x, training_y) in the format [[1, 2, 3], [2, 3, 2], ...] and [2, 4, ...]
    raises:
    ValueError if ngram_size is smaller than 2, since a sample needs at least
        one input token and one label token
    '''
    if ngram_size is None:
        ngram_size = NGRAM_SIZE
    if ngram_size < 2:
        raise ValueError(f"ngram_size must be at least 2, got {ngram_size}")

    context_size = ngram_size - 1
    training_x = []
    training_y = []

    for sentence in encoded:
        # range() is empty when the sentence is shorter than the window
        for start in range(len(sentence) - context_size):
            training_x.append(sentence[start:start + context_size])
            training_y.append(sentence[start + context_size])

    return (training_x, training_y)

In [None]:
# Build the n-gram samples, then show the first few inputs followed by their labels.
training_x, training_y = generate_ngram_training_samples(encoded)

print(training_x[0:5], training_y[0:5], sep="\n")

[[3, 1, 1444, 3133, 2], [1, 1444, 3133, 2, 1], [1444, 3133, 2, 1, 918], [3133, 2, 1, 918, 68], [2, 1, 918, 68, 334]]
[1, 918, 68, 334, 5]


In [None]:
def create_word_to_embedding(embs: KeyedVectors) -> dict:
    """
    Builds a dictionary with one entry per word in the embedding vocabulary,
    keyed by the word and holding that word's embedding vector.
    """
    return {word: embs[word] for word in embs.key_to_index}

def create_index_to_embedding(embs: KeyedVectors, tokenizer: Tokenizer) -> dict:
  """
  Builds a dictionary with one entry per word in the embedding vocabulary,
  keyed by the index the tokenizer assigned to the word and holding that
  word's embedding vector.

  Raises a KeyError if an embedding word is absent from tokenizer.word_index.
  """
  word_index = tokenizer.word_index
  return {word_index[word]: embs[word] for word in embs.key_to_index}

def get_word_to_index(word: str, tokenizer: Tokenizer):
  """
  Returns the tokenizer index of a single word, obtained by encoding the word
  as a one-token text and taking the first index of the first sequence.
  """
  sequences = tokenizer.texts_to_sequences([[word]])
  first_sequence = sequences[0]
  return first_sequence[0]

In [None]:
# Lookup tables over the trained vectors: one keyed by tokenizer index, one keyed by word.
index_to_embedding = create_index_to_embedding(reddit_haiku_embs, tokenizer)
word_to_embedding = create_word_to_embedding(reddit_haiku_embs)

In [None]:
def data_generator(X: list, y: list, num_sequences_per_batch: int, i_to_emb: dict):
    '''
    Endless generator of training batches for the feedforward model.

    Walks over the samples in order, wrapping back to the first sample after
    the last one, and yields a tuple (inputs, labels) every time
    num_sequences_per_batch samples have been collected:

    inputs: array of shape (num_sequences_per_batch, tokens_per_sample * emb_size),
        the embeddings of each sample's tokens flattened into one row
    labels: array holding one one-hot vector per sample (built with
        to_categorical) with the 1 at the label's tokenizer index

    Parameters:
    X: list of samples, each a list of tokenizer indices
    y: list with the label (a tokenizer index) of each sample
    num_sequences_per_batch: number of samples per yielded batch
    i_to_emb: mapping from tokenizer index to embedding vector
    '''
    batch_embs, batch_labels = [], []
    position = 0
    while True:
        # wrap around so the generator never runs out of samples
        position %= len(X)

        # one embedding per token: shape (tokens_per_sample, emb_size)
        batch_embs.append([i_to_emb[token] for token in X[position]])

        # one extra class because i_to_emb has no entry for index 0
        one_hot = to_categorical(y[position], num_classes=len(i_to_emb) + 1)
        batch_labels.append(one_hot)

        if len(batch_embs) % num_sequences_per_batch == 0:
            # flatten (batch, tokens_per_sample, emb_size) into (batch, tokens_per_sample * emb_size)
            flat_inputs = np.reshape(batch_embs, (num_sequences_per_batch, -1))
            yield (flat_inputs, np.array(batch_labels))
            batch_embs, batch_labels = [], []

        position += 1
      

# sample = next(data_generator(training_x[:3], training_y[:3], 2, index_to_embedding))
# print(sample)

In [None]:
# Endless batch generator over the full training set.
train_generator = data_generator(
    X=training_x,
    y=training_y,
    num_sequences_per_batch=BATCH_SIZE,
    i_to_emb=index_to_embedding,
)

# Model 1: feedforward NN

In [None]:
def build_feedforward_model(input_units, hidden_units, output_units, hidden_activation="relu"):
  """
  Builds and compiles a feedforward network with a single hidden layer.

  Parameters:
  input_units: length of each input vector
  hidden_units: number of units in the hidden layer
  output_units: number of output classes
  hidden_activation: activation of the hidden layer. Defaults to "relu";
      softmax is reserved for the output layer, because applying it to the
      hidden layer forces the hidden representation to sum to 1 and severely
      limits what the network can learn.
  return:
  the compiled Sequential model (Adam optimizer with learning rate 0.01,
  categorical cross-entropy loss, so labels must be one-hot encoded)
  """
  model = Sequential()
  model.add(Input(shape=(input_units,)))  # inputs will be vectors of this length, batch size not specified
  model.add(Dense(hidden_units, activation=hidden_activation))
  # softmax turns the output layer into a probability distribution over the classes
  model.add(Dense(output_units, activation="softmax"))
  model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.01),
                loss=keras.losses.CategoricalCrossentropy())
  return model

# Model 2: RNN