Homework 5: Neural Language Models  (& 🎃 SpOoKy 👻 authors 🧟 data) - Task 3
---

Task 3: Feedforward Neural Language Model (60 points)
--------------------------

For this task, you will create and train neural LMs for both your word-based embeddings and your character-based ones. You should write functions when appropriate to avoid excessive copy+pasting.

### a) First, encode  your text into integers (5 points)

In [47]:
# Importing utility functions from Keras
!pip install tensorflow
import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical

# necessary
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, InputLayer, Flatten
from tensorflow.keras.optimizers import Adam

# optional
# from keras.layers import Dropout

# if you want fancy progress bars
from tqdm import notebook
from IPython.display import display

# your other imports here
import time
import neurallm_utils as nutils
import numpy as np
import pandas as pd
import random




In [9]:
# load in necessary data
TRAIN_FILE = 'spooky_author_train.csv'
spooky_author_data = pd.read_csv(TRAIN_FILE)
spooky_text_data = spooky_author_data['text']

In [10]:
# constants you may find helpful. Edit as you would like.
EMBEDDINGS_SIZE = 50
NGRAM = 3 # The ngram language model you want to train

In [11]:
# Initialize a Tokenizer and fit on your data
# do this for both the word and character data

# It is used to vectorize a text corpus. Here, it just creates a mapping from 
# word to a unique index. (Note: Indexing starts from 0)
# Example:
# tokenizer = Tokenizer()
# tokenizer.fit_on_texts(data)
# encoded = tokenizer.texts_to_sequences(data)

word_tokenizer = Tokenizer()
word_tokenizer.fit_on_texts(spooky_text_data)
word_encoded = word_tokenizer.texts_to_sequences(spooky_text_data)

char_tokenizer = Tokenizer(char_level=True)
char_tokenizer.fit_on_texts(spooky_text_data)
char_encoded = char_tokenizer.texts_to_sequences(spooky_text_data)

word_vocab_size = len(word_tokenizer.word_index)
char_vocab_size = len(char_tokenizer.word_index)

In [12]:
# print out the size of the word index for each of your tokenizers
# this should match what you calculated in Task 2 with your embeddings

print(word_vocab_size)
print(char_vocab_size)



25943
58


### b) Next, prepare the sequences to train your model from text (5 points)

#### Fixed n-gram based sequences

In [15]:
def generate_ngram_training_samples(encoded: list, ngram: int) -> list:
    '''
    Takes the encoded data (list of lists) and 
    generates the training samples out of it.
    Parameters:
    up to you, we've put in what we used
    but you can add/remove as needed
    return: 
    list of lists in the format [[x1, x2, ... , x(n-1), y], ...]
    '''
    samples = []
    for seq in encoded:
        for i in range(ngram, len(seq) + 1):
            samples.append(seq[i-ngram:i])
    return samples

# generate your training samples for both word and character data
# print out the first 5 training samples for each
# we have displayed the number of sequences
# to expect for both characters and words
#
# Spooky data by character should give 2957553 sequences
# [21, 21, 3]
# [21, 3, 9]
# [3, 9, 7]
# ...
# Spooky data by words shoud give 634080 sequences
# [1, 1, 32]
# [1, 32, 2956]
# [32, 2956, 3]
# ...

word_ngram_samples = generate_ngram_training_samples(word_encoded, NGRAM)
char_ngram_samples = generate_ngram_training_samples(char_encoded, NGRAM)
word_samples_preview = word_ngram_samples[:5]
char_samples_preview = char_ngram_samples[:5]
word_sequences_count = len(word_ngram_samples)
char_sequences_count = len(char_ngram_samples)

word_samples_preview, char_samples_preview, word_sequences_count, char_sequences_count

([[26, 2945, 143],
  [2945, 143, 1372],
  [143, 1372, 22],
  [1372, 22, 36],
  [22, 36, 294]],
 [[3, 9, 7], [9, 7, 8], [7, 8, 1], [8, 1, 20], [1, 20, 10]],
 483974,
 2879237)

### c) Then, split the sequences into X and y and create a Data Generator (20 points)

In [17]:
# 2.5 points

# Note here that the sequences were in the form: 
# sequence = [x1, x2, ... , x(n-1), y]
# We still need to separate it into [[x1, x2, ... , x(n-1)], ...], [y1, y2, ...]]
# do that here



# print out the shapes to verify that they are correct

def split_sequences(sequences):
    X = [seq[:-1] for seq in sequences]
    y = [seq[-1] for seq in sequences]
    return np.array(X), np.array(y)

X_word, y_word = split_sequences(word_ngram_samples)
X_char, y_char = split_sequences(char_ngram_samples)

print("Word-level shapes:")
print(f"X_word shape: {X_word.shape}, y_word shape: {y_word.shape}")

print("\nCharacter-level shapes:")
print(f"X_char shape: {X_char.shape}, y_char shape: {y_char.shape}")

Word-level shapes:
X_word shape: (483974, 2), y_word shape: (483974,)

Character-level shapes:
X_char shape: (2879237, 2), y_char shape: (2879237,)


In [18]:
# 2.5 points

# Initialize a function that reads the word embeddings you saved earlier
# and gives you back mappings from words to their embeddings and also 
# indexes from the tokenizers to their embeddings

def read_embeddings(filename: str, tokenizer: Tokenizer) -> (dict, dict):
    '''Loads and parses embeddings trained in earlier.
    Parameters:
        filename (str): path to file
        Tokenizer: tokenizer used to tokenize the data (needed to get the word to index mapping)
    Returns:
        (dict): mapping from word to its embedding vector
        (dict): mapping from index to its embedding vector
    '''
    word_to_embedding = {}
    index_to_embedding = {}

    index_to_embedding[0] = np.zeros(EMBEDDINGS_SIZE)
    with open(filename, 'r') as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            word_to_embedding[word] = vector

    for word, index in tokenizer.word_index.items():
        if word in word_to_embedding:
            index_to_embedding[index] = word_to_embedding[word]
        else:
            index_to_embedding[index] = np.random.uniform(-0.01, 0.01, EMBEDDINGS_SIZE)

    return word_to_embedding, index_to_embedding



In [19]:
# NECESSARY FOR CHARACTERS

# the "0" index of the Tokenizer is assigned for the padding token. Initialize
# the vector for padding token as all zeros of embedding size
# this adds one to the number of embeddings that were initially saved
# (and increases your vocab size by 1)

In [20]:
# 10 points

def data_generator(X: list, y: list, num_sequences_per_batch: int, index_2_embedding: dict, num_classes: int):
    '''
    Returns data generator to be used by feed_forward
    https://wiki.python.org/moin/Generators
    https://realpython.com/introduction-to-python-generators/
    
    Yields batches of embeddings and labels to go with them.
    Use one hot vectors to encode the labels 
    (see the to_categorical function)
    
    Returns data generator to be used by feed_forward
    '''
    num_samples = len(X)
    embedding_dim = next(iter(index_2_embedding.values())).shape[0]
    sequence_length = len(X[0])

    while True:
        indices = np.arange(num_samples)
        np.random.shuffle(indices)
        X_shuffled = [X[i] for i in indices]
        y_shuffled = [y[i] for i in indices]

        for offset in range(0, num_samples, num_sequences_per_batch):
            end = offset + num_sequences_per_batch
            X_batch_indices = X_shuffled[offset:end]
            y_batch_indices = y_shuffled[offset:end]

            current_batch_size = len(X_batch_indices)

            X_batch = np.zeros((current_batch_size, sequence_length, embedding_dim), dtype='float32')
            y_batch = np.zeros((current_batch_size, num_classes), dtype='float32')

            for i, seq in enumerate(X_batch_indices):
                for j, index in enumerate(seq):
                    X_batch[i, j] = index_2_embedding.get(index, np.zeros(embedding_dim))
                y_batch[i] = to_categorical(y_batch_indices[i], num_classes=num_classes)
            yield X_batch, y_batch

In [21]:
# 5 points

# initialize your data_generator for both word and character data
# print out the shapes of the first batch to verify that it is correct for both word and character data

# Examples:
# num_sequences_per_batch = 128 # this is the batch size
# steps_per_epoch = len(sequences)//num_sequences_per_batch  # Number of batches per epoch
# train_generator = data_generator(X, y, num_sequences_per_batch)

# sample=next(train_generator) # this is how you get data out of generators
# sample[0].shape # (batch_size, (n-1)*EMBEDDING_SIZE)  (128, 200)
# sample[1].shape   # (batch_size, |V|) to_categorical



# Define the batch size
num_sequences_per_batch = 128
word_embeddings_file = 'spooky_embedding_word.txt'
_, index_to_embedding_word = read_embeddings(word_embeddings_file, word_tokenizer)
char_embeddings_file = 'spooky_embedding_char.txt'
_, index_to_embedding_char = read_embeddings(char_embeddings_file, char_tokenizer)
vocab_size_word = len(word_tokenizer.word_index)
num_classes_word = vocab_size_word
vocab_size_char = len(char_tokenizer.word_index)
num_classes_char = vocab_size_char
word_train_generator = data_generator(
    X_word, y_word, num_sequences_per_batch, index_to_embedding_word, num_classes_word
)

char_train_generator = data_generator(
    X_char, y_char, num_sequences_per_batch, index_to_embedding_char, num_classes_char
)

word_sample = next(word_train_generator)
print("Word sample shapes:")
print("Input shape:", word_sample[0].shape)
print("Label shape:", word_sample[1].shape)
char_sample = next(char_train_generator)
print("\nCharacter sample shapes:")
print("Input shape:", char_sample[0].shape)
print("Label shape:", char_sample[1].shape)

Word sample shapes:
Input shape: (128, 2, 50)
Label shape: (128, 25944)

Character sample shapes:
Input shape: (128, 2, 50)
Label shape: (128, 59)


### d) Train & __save__ your models (15 points)

In [23]:
# 15 points 

# code to train a feedforward neural language model for 
# both word embeddings and character embeddings
# make sure not to just copy + paste to train your two models
# (define functions as needed)

# train your models for between 3 & 5 epochs
# on our machine, this takes ~ 24 min for character embeddings and ~ 10 min for word embeddings
# DO NOT EXPECT ACCURACIES OVER 0.5 (and even that is very for this many epochs)
# We recommend starting by training for 1 epoch

# Define your model architecture using Keras Sequential API
# Use the adam optimizer instead of sgd
# add cells as desired

def build_feedforward_model(input_shape, num_classes):
    model = Sequential()
    model.add(InputLayer(shape=input_shape))
    model.add(Flatten())
    model.add(Dense(128, activation='relu'))
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(optimizer=Adam(), loss='categorical_crossentropy', metrics=['accuracy'])
    return model

steps_per_epoch_word = len(X_word) // num_sequences_per_batch
sequence_length_word = X_word.shape[1]
embedding_dim_word = EMBEDDINGS_SIZE
input_shape_word = (sequence_length_word, embedding_dim_word)
vocab_size_word = len(word_tokenizer.word_index) + 1 
num_classes_word = vocab_size_word
word_train_generator = data_generator(X_word, y_word, num_sequences_per_batch, index_to_embedding_word, num_classes_word)
model_word = build_feedforward_model(input_shape_word, num_classes_word)



steps_per_epoch_char = len(X_char) // num_sequences_per_batch
sequence_length_char = X_char.shape[1]
embedding_dim_char = EMBEDDINGS_SIZE
input_shape_char = (sequence_length_char, embedding_dim_char)
vocab_size_char = len(char_tokenizer.word_index) + 1
num_classes_char = vocab_size_char
char_train_generator = data_generator(X_char, y_char, num_sequences_per_batch, index_to_embedding_char, num_classes_char)
model_char = build_feedforward_model(input_shape_char, num_classes_char)

In [24]:
# Here is some example code to train a model with a data generator
# model.fit(x=train_generator, 
#           steps_per_epoch=steps_per_epoch,
#           epochs=1)

In [25]:

# spooky data model by character for 5 epochs takes ~ 24 min on our computer
# with adam optimizer, gets accuracy of 0.3920

# spooky data model by word for 5 epochs takes 10 min on our computer
# results in accuracy of 0.2110

model_word.fit(
    x=word_train_generator,
    steps_per_epoch=steps_per_epoch_word,
    epochs=5
)
model_char.fit(
    x=char_train_generator,
    steps_per_epoch=steps_per_epoch_char,
    epochs=5
)

Epoch 1/5
[1m3781/3781[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m216s[0m 57ms/step - accuracy: 0.1058 - loss: 6.9168
Epoch 2/5
[1m3781/3781[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m186s[0m 49ms/step - accuracy: 0.1342 - loss: 6.0239
Epoch 3/5
[1m3781/3781[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m160s[0m 42ms/step - accuracy: 0.1381 - loss: 5.7224
Epoch 4/5
[1m3781/3781[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m159s[0m 42ms/step - accuracy: 0.1408 - loss: 5.4905
Epoch 5/5
[1m3781/3781[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m159s[0m 42ms/step - accuracy: 0.1426 - loss: 5.3033
Epoch 1/5
[1m22494/22494[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m91s[0m 4ms/step - accuracy: 0.3439 - loss: 2.2361
Epoch 2/5
[1m22494/22494[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m92s[0m 4ms/step - accuracy: 0.3787 - loss: 2.0218
Epoch 3/5
[1m22494/22494[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m92s[0m 4ms/step - accuracy: 0.3812 - loss: 2.0027
Epoch 4/

<keras.src.callbacks.history.History at 0x2544bb422a0>

In [26]:
# save your trained models so you can re-load instead of re-training each time
# also, you'll need these to generate your sentences!



model_word.save("word_language_model.h5")
model_char.save("char_language_model.h5")



### e) Generate Sentences (15 points)

In [28]:
# load your models if you need to
loaded_word_model = load_model("word_language_model.h5")
loaded_char_model = load_model("char_language_model.h5")



In [37]:
# 10 points

# # generate a sequence from the model until you get an end of sentence token
# This is an example function header you might use
def generate_seq(model, tokenizer, seed, index_to_embedding, max_length=50, end_token='</s>'):
    result_sequence = seed.copy()
    context_window = seed.copy()

    embedding_dim = next(iter(index_to_embedding.values())).shape[0]
    sequence_length = len(context_window)
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}
    index_to_word[0] = '<PAD>'

    eos_index = tokenizer.word_index.get(end_token)
    if eos_index is None:
        eos_index = None

    for _ in range(max_length):
        input_embeddings = np.array([
            index_to_embedding.get(idx, np.zeros(embedding_dim)) for idx in context_window
        ])
        input_embeddings = input_embeddings.reshape(1, sequence_length, embedding_dim)

        yhat = model.predict(input_embeddings, verbose=0)
        next_index = np.argmax(yhat, axis=-1)[0]

        result_sequence.append(next_index)

        if eos_index is not None and next_index == eos_index:
            break

        context_window.append(next_index)
        context_window = context_window[1:]

    generated_words = [index_to_word.get(idx, '<UNK>') for idx in result_sequence]
    generated_words = [word for word in generated_words if word not in ('<s>', '</s>')]
    generated_sentence = ' '.join(generated_words).replace('_', ' ')

    return generated_sentence

In [66]:
# 5 points

# generate and display one sequence from both the word model and the character model
# do not include <s> or </s> in your displayed sentences
# make sure that you can read the output easily (i.e. don't just print out a list of tokens)

# you may leave _ as _ or replace it with a space if you prefer

seed_sequence_word = X_word[0].tolist()

generated_sentence_word = generate_seq(
    model=model_word,
    tokenizer=word_tokenizer,
    seed=seed_sequence_word,
    index_to_embedding=index_to_embedding_word,
    max_length=20,
    end_token='</s>' 
)

print("Generated sentence from the word model:")
print(generated_sentence_word)


seed_sequence_char = X_char[0].tolist()

generated_sentence_char = generate_seq(
    model=model_char,
    tokenizer=char_tokenizer,
    seed=seed_sequence_char,
    index_to_embedding=index_to_embedding_char,
    max_length=50,
    end_token='</s>'
)

print("\nGenerated sentence from the character model:")
print(generated_sentence_char)

Generated sentence from the word model:
this process was not to be sure to be sure to be sure to be sure to be sure to be sure

Generated sentence from the character model:
t h e   t h e   t h e   t h e   t h e   t h e   t h e   t h e   t h e   t h e   t h e   t h e   t h e  


In [57]:
# generate 100 example sentences with each model and save them to a file, one sentence per line
# do not include <s> and </s> in your saved sentences (you'll use these sentences in your next task)
# this will produce two files, one for each model
def generate_sentences(model, tokenizer, X_data, index_to_embedding, num_sentences, max_length=20, end_token='</s>'):
    sentences = []

    for _ in range(num_sentences):
        seed_sequence = random.choice(X_data).tolist()
        generated_sequence = generate_seq(
            model=model,
            tokenizer=tokenizer,
            seed=seed_sequence,
            index_to_embedding=index_to_embedding,
            max_length=max_length,
            end_token=end_token
        )

        sentences.append(generated_sequence)

    return sentences

num_sentences = 100
max_length_word = 20
generated_sentences_word = generate_sentences(
    model=model_word,
    tokenizer=word_tokenizer,
    X_data=X_word,
    index_to_embedding=index_to_embedding_word,
    num_sentences=num_sentences,
    max_length=max_length_word,
    end_token='</s>'
)

output_file_word = 'generated_sentences_word.txt'
with open(output_file_word, 'w', encoding='utf-8') as f:
    for sentence in generated_sentences_word:
        sentence_clean = ' '.join([word for word in sentence.split() if word not in ('<s>', '</s>')])
        f.write(sentence_clean + '\n')

max_length_char = 50
generated_sentences_char = generate_sentences(
    model=model_char,
    tokenizer=char_tokenizer,
    X_data=X_char,
    index_to_embedding=index_to_embedding_char,
    num_sentences=num_sentences,
    max_length=max_length_char,
    end_token='</s>'
)

output_file_char = 'generated_sentences_char.txt'
with open(output_file_char, 'w', encoding='utf-8') as f:
    for sentence in generated_sentences_char:
        sentence_clean = ' '.join([char for char in sentence.split() if char not in ('<s>', '</s>')])
        f.write(sentence_clean + '\n')