In [40]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [104]:
import tensorflow as tf
import numpy as np

import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/selinawang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/selinawang/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Preprocessing


questions:
- what to do with stop words?

### Read data

In [2]:
with open('pg1597.txt') as f:
    lines = f.readlines()
print(lines[:6])

["THE EMPEROR'S NEW CLOTHES\n", '\n', 'Many years ago, there was an Emperor, who was so excessively fond of\n', 'new clothes, that he spent all his money in dress. He did not trouble\n', 'himself in the least about his soldiers; nor did he care to go either to\n', 'the theatre or the chase, except for the opportunities then afforded him\n']


### Preprocess data

In [73]:
def preprocess(line):
    data = "".join(line)
    data = word_tokenize(data)
    words = " ".join(data)
    lower_w = words.lower()

    stop_words = set(stopwords.words('english')) 
    word_tokens = lower_w.split(" ")
  
    filtered_sentence = [w for w in word_tokens if not w in stop_words] 
    
    ps = PorterStemmer()
    output = []
    for word in filtered_sentence:
        output.append((ps.stem(word)))
    final_out = " ".join(output)
    return final_out

### get vocabulary

In [167]:
def get_vocab(data):
    data = data.replace('\n', ' ').split(' ')
    words = list(set(data))
    vocabulary = {word:index for index, word in enumerate(words)}
    vocab_size = len(vocabulary)
    
    return vocabulary, vocab_size
    

In [168]:
train_data = preprocess(lines)
vocabulary, vocab_size = get_vocab(train_data)

In [169]:
vocab_size

4119

In [170]:
vocabulary

{'spoil': 0,
 'torrent': 1,
 'told': 2,
 'inquir': 3,
 'brass': 4,
 'root': 5,
 'rare': 6,
 'float': 7,
 "one'": 8,
 'bluish': 9,
 'coach': 10,
 'inward': 11,
 'breast': 12,
 'tuck': 13,
 'five': 14,
 'patrol': 15,
 'bearer': 16,
 'sad': 17,
 'inexhaust': 18,
 'large-s': 19,
 'it.': 20,
 'tend': 21,
 'banner': 22,
 'necessari': 23,
 'leap': 24,
 'me.': 25,
 'oil': 26,
 'vaudevil': 27,
 'forgot': 28,
 'peer': 29,
 'griev': 30,
 'heed': 31,
 'wall': 32,
 'thanke': 33,
 'mirror': 34,
 'nybod': 35,
 'curios': 36,
 'chamber': 37,
 'shop-window': 38,
 '!': 39,
 'granni': 40,
 'reveal': 41,
 'outspread': 42,
 'pay': 43,
 'gain': 44,
 'now-a-day': 45,
 'broke': 46,
 'curl': 47,
 'medicin': 48,
 'reli': 49,
 'end': 50,
 'rain-wat': 51,
 'pistol': 52,
 'war': 53,
 'uniform': 54,
 'else.': 55,
 'whisper': 56,
 'inquisit': 57,
 'dozen': 58,
 'front': 59,
 'last': 60,
 'thin': 61,
 'flush': 62,
 'speak': 63,
 'morgana': 64,
 'forehead': 65,
 'late': 66,
 'debt.': 67,
 'templ': 68,
 'deepli': 69,
 '

## Bi-directional LSTM Masked Language Modeling

references: 

https://keras.io/examples/nlp/masked_language_modeling/#create-bert-model-pretraining-model-for-masked-language-modeling

https://www.kaggle.com/code/ritvik1909/masked-language-modelling-rnn#Data-Preparation

https://keras.io/examples/nlp/bidirectional_lstm_imdb/

questions:
- should we split data by sentence instead of by fixed window size of 20?


### more data preparation

In [132]:
# add [mask] to vocabulary
mask_id = vocab_size
vocabulary['[mask]'] = mask_id

In [133]:
# convert words to vectors
vectorized_text = list(map(lambda x: vocabulary[x], train_data))
vectorized_text = np.array(y)

In [134]:
# split data into sequences of length 20
vectorized_text_len = len(vectorized_text) - (len(vectorized_text) % 20)
vectorized_text = vectorized_text[:vectorized_text_len]
vectorized_text = np.reshape(vectorized_text,[-1,20])

In [135]:
vectorized_text

array([[1567, 3934,  413, ...,  868, 2541, 2780],
       [2465, 1168, 1045, ..., 1432,  272, 1297],
       [1045, 3584, 1567, ..., 1567, 3330, 2471],
       ...,
       [3280, 2445, 1946, ..., 1739, 3885, 3205],
       [1481, 2714, 3265, ..., 2701,  874, 3731],
       [3577, 3205, 1542, ..., 2541, 1792,   32]])

In [171]:
def get_masked_input_label(sequence):
    
    # randomly choose one position in sequence to mask
    mask = np.random.randint(low=0, high=20)
    
    # add mask to input
    masked_input = [token if i != mask else mask_id for i, token in enumerate(sequence)]
    
    # set all values in label to -1(ignored by loss function) except the value at the masked position
    label = [-1 if i!= mask else token for i, token in enumerate(sequence)]
    return masked_input, label


In [152]:
# get masked inputs and labels
inputs = []
labels = []

for seq in vectorized_text:
    x,y = get_masked_input_label(seq)
    inputs.append(x)
    labels.append(y)
inputs = np.array(inputs)
labels = np.array(labels)
    

In [155]:
inputs.shape, labels.shape

((1820, 20), (1820, 20))

In [156]:
labels

array([[  -1,   -1,   -1, ...,   -1,   -1,   -1],
       [  -1,   -1,   -1, ...,   -1,   -1,   -1],
       [  -1,   -1,   -1, ...,   -1, 3330,   -1],
       ...,
       [  -1,   -1,   -1, ...,   -1,   -1,   -1],
       [  -1,   -1,   -1, ...,   -1,   -1,   -1],
       [  -1,   -1,   -1, ...,   -1, 1792,   -1]])

### bi-directional lstm model building and training

In [116]:
# define masked language modeling class
class LSTM_MLM(tf.keras.Model):
    def __init__(self, vocab_size, embed_size, input_length):
        """
        The Model class predicts the next words in a sequence.
        : param vocab_size : The number of unique words in the data
        : param hidden_size   : The size of your desired RNN
        : param embed_size : The size of your latent embedding
        """

        super().__init__()

        self.vocab_size = vocab_size
        self.embed_size = embed_size
        self.input_length = input_length

        ## TODO: define your trainable variables and/or layers here. This should include an
        ## embedding component, and any other variables/layers you require.

        # embedding layer
        self.embedding = tf.keras.layers.Embedding(input_dim=self.vocab_size+1, output_dim=self.embed_size)
        self.lstm = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, return_sequences=True))
        self.dense1 = tf.keras.layers.Dense(self.vocab_size, activation='softmax')

        # fully connected linear layers


    def call(self, inputs):
        """
        You must use an embedding layer as the first layer of your network (i.e. tf.nn.embedding_lookup or tf.keras.layers.Embedding)
        :param inputs: word ids of shape (batch_size, 2)
        :return: logits: The batch element probabilities as a tensor of shape (batch_size, vocab_size)
        """

        # embedding layer
        x = inputs
        
        x = self.embedding(x)
        x = self.lstm(x)
        x = self.dense1(x)

        
        return x


In [159]:
model = LSTM_MLM(vocab_size, 64, 20)
loss_metric = tf.keras.losses.SparseCategoricalCrossentropy(ignore_class=-1)
model.compile(loss=loss_metric, optimizer='adam', metrics=['acc'])
model.fit(x=inputs, y=labels, validation_split=0.1, batch_size=20, epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7fd82f3788b0>

In [172]:
# make prediction: still in progress

#query = ['emperor', 'like', '[mask]', 'cloth', 'dress']
#query_id = [vocabulary[q] for q in query]
query_id = x[0:2]

pred = model(query_id)
pred

AttributeError: Exception encountered when calling layer 'embedding_13' (type Embedding).

'list' object has no attribute 'dtype'

Call arguments received by layer 'embedding_13' (type Embedding):
  • inputs=['3577', '3205']