In [40]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [1]:
import tensorflow as tf
import numpy as np

import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

2023-04-22 13:47:33.014320: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/selinawang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/selinawang/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Preprocessing


- removed gender pronouns from list of stopwords

### Read data

In [3]:
with open('../data/pg1597.txt') as f:
    lines = f.readlines()
print(lines[:6])

["THE EMPEROR'S NEW CLOTHES\n", '\n', 'Many years ago, there was an Emperor, who was so excessively fond of\n', 'new clothes, that he spent all his money in dress. He did not trouble\n', 'himself in the least about his soldiers; nor did he care to go either to\n', 'the theatre or the chase, except for the opportunities then afforded him\n']


### Preprocess data

In [116]:
def preprocess(line):
    data = "".join(line)
    data = word_tokenize(data)
    words = " ".join(data)
    lower_w = words.lower()
    
    # keep gender pronouns in stop words
    stop_words_keep = ['he',
     'him',
     'his',
     'himself',
     'she',
     "she's",
     'her',
     'hers',
     'herself']
    stop_words = [word for word in stopwords.words('english') if word not in stop_words_keep]

    stop_words = set(stop_words) 
    word_tokens = lower_w.split(" ")
  
    filtered_sentence = [w for w in word_tokens if not w in stop_words] 
    
    ps = PorterStemmer()
    output = []
    for word in filtered_sentence:
        output.append((ps.stem(word)))
    final_out = " ".join(output)
    final_out = final_out.replace('\n', ' ').split(' ')
    return final_out

### get vocabulary

In [117]:
def get_vocab(data):
    words = list(set(data))
    vocabulary = {word:index for index, word in enumerate(words)}
    vocab_size = len(vocabulary)
    
    return vocabulary, vocab_size
    

In [118]:
train_data = preprocess(lines)
vocabulary, vocab_size = get_vocab(train_data)

In [119]:
vocab_size, vocabulary

(4126,
 {'among': 0,
  'bank': 1,
  'bout': 2,
  'fata': 3,
  'ascend': 4,
  'wear': 5,
  'bearer': 6,
  'crumbl': 7,
  'rose': 8,
  'crack': 9,
  'water-pl': 10,
  'grain': 11,
  'weather-cock': 12,
  'climb': 13,
  'soundli': 14,
  'purchas': 15,
  'hurl': 16,
  'absent': 17,
  'a.': 18,
  'leapt': 19,
  'cup': 20,
  'couldst': 21,
  'militia': 22,
  'uppermost': 23,
  'tread': 24,
  'half-dress': 25,
  'colder': 26,
  'went': 27,
  'belov': 28,
  'exist': 29,
  'frozen': 30,
  'throne': 31,
  'rabbit': 32,
  'shelv': 33,
  'lid': 34,
  'becom': 35,
  'freez': 36,
  'farthest': 37,
  '1484': 38,
  'wild-dron': 39,
  'mighti': 40,
  'china': 41,
  'spinning-wheel': 42,
  'crafti': 43,
  'encount': 44,
  'stair': 45,
  'queen': 46,
  'hastili': 47,
  'signifi': 48,
  'asund': 49,
  'vein': 50,
  'nurs': 51,
  'adjoin': 52,
  'cold': 53,
  'imbib': 54,
  'supple.': 55,
  'sake': 56,
  'rate': 57,
  'tell': 58,
  'scrubbi': 59,
  'unknown': 60,
  'terrif': 61,
  'brim': 62,
  'feel': 63,

## Bi-directional LSTM Masked Language Modeling

references: 

https://keras.io/examples/nlp/masked_language_modeling/#create-bert-model-pretraining-model-for-masked-language-modeling

https://www.kaggle.com/code/ritvik1909/masked-language-modelling-rnn#Data-Preparation

https://keras.io/examples/nlp/bidirectional_lstm_imdb/

questions:
- should we split data by sentence instead of by fixed window size of 20?


### more data preparation

In [120]:
# convert words to vectors
vectorized_text = list(map(lambda x: vocabulary[x], train_data))
vectorized_text = np.array(vectorized_text)

# add [mask] to vocabulary
mask_id = vocab_size
vocabulary['[mask]'] = mask_id

# split data into sequences of length 20
vectorized_text_len = len(vectorized_text) - (len(vectorized_text) % 20)
vectorized_text = vectorized_text[:vectorized_text_len]
vectorized_text = np.reshape(vectorized_text,[-1,20])

In [121]:
vectorized_text

array([[2253, 3282, 2465, ..., 1829, 3713, 3930],
       [4029, 3383, 1800, ..., 2230, 2040, 3441],
       [1829, 2465, 3398, ...,  875, 3383,  750],
       ...,
       [ 518, 1709, 3282, ...,  908, 1174,  875],
       [1787,  908, 1174, ..., 3839,  214, 1523],
       [2021, 1966,  518, ...,  628, 2073, 1419]])

In [122]:
def mask_one_input_label(sequence):
    
    # randomly choose one position in sequence to mask
    mask = np.random.randint(low=0, high=20)
    
    # add mask to input
    masked_input = [token if i != mask else mask_id for i, token in enumerate(sequence)]
    
    # set all values in label to -1(ignored by loss function) except the value at the masked position
    label = [-1 if i!= mask else token for i, token in enumerate(sequence)]
    return masked_input, label


In [123]:
# get masked inputs and labels
def get_masked_inputs_labels(text):
    inputs = []
    labels = []

    for seq in text:
        x,y = mask_one_input_label(seq)
        inputs.append(x)
        labels.append(y)
    inputs = np.array(inputs)
    labels = np.array(labels)
    
    return inputs, labels


In [124]:
inputs, labels = get_masked_inputs_labels(vectorized_text)

### bi-directional lstm model building and training

In [126]:
# define masked language modeling class
class LSTM_MLM(tf.keras.Model):
    def __init__(self, vocab_size, embed_size, input_length):
        """
        The Model class predicts the next words in a sequence.
        : param vocab_size : The number of unique words in the data
        : param hidden_size   : The size of your desired RNN
        : param embed_size : The size of your latent embedding
        """

        super().__init__()

        self.vocab_size = vocab_size
        self.embed_size = embed_size
        self.input_length = input_length

        ## TODO: define your trainable variables and/or layers here. This should include an
        ## embedding component, and any other variables/layers you require.

        # embedding layer
        self.embedding = tf.keras.layers.Embedding(input_dim=self.vocab_size+1, output_dim=self.embed_size)
        self.lstm = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, return_sequences=True))
        self.dense1 = tf.keras.layers.Dense(self.vocab_size, activation='softmax')

        # fully connected linear layers


    def call(self, inputs):
        """
        You must use an embedding layer as the first layer of your network (i.e. tf.nn.embedding_lookup or tf.keras.layers.Embedding)
        :param inputs: word ids of shape (batch_size, 2)
        :return: logits: The batch element probabilities as a tensor of shape (batch_size, vocab_size)
        """

        # embedding layer
        x = inputs
        
        x = self.embedding(x)
        x = self.lstm(x)
        x = self.dense1(x)

        
        return x


In [127]:
model = LSTM_MLM(vocab_size, 64, 20)
loss_metric = tf.keras.losses.SparseCategoricalCrossentropy(ignore_class=-1)
# accuracy is not a good measure
model.compile(loss=loss_metric, optimizer='adam')
model.fit(x=inputs, y=labels, batch_size=20, epochs=20) 
# we do not need validation because our purpose is only to learn the patterns in our training data

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f8abf84fdf0>

### get predicted probability

In [128]:
# make prediction: still in progress

def get_predicted_probability(masked_sentence, target_word):
    masked_sentence = masked_sentence.split(' ')
    mask_loc = masked_sentence.index('[mask]')
    target_id = vocabulary[target_word]
    query_id = [vocabulary[q] for q in masked_sentence]

    query_id = tf.expand_dims(query_id, axis=0)
    pred = model(query_id)[:,mask_loc, target_id]
    return pred


In [129]:
test_sentence = '[mask] like beauti dress'

In [132]:
get_predicted_probability(test_sentence, 'she')

<tf.Tensor: shape=(1,), dtype=float32, numpy=array([0.02857665], dtype=float32)>

In [133]:
get_predicted_probability(test_sentence, 'he')

<tf.Tensor: shape=(1,), dtype=float32, numpy=array([0.01522405], dtype=float32)>

In [185]:
get_predicted_probability(test_sentence, 'queen')

<tf.Tensor: shape=(1,), dtype=float32, numpy=array([0.00066057], dtype=float32)>

In [186]:
get_predicted_probability(test_sentence, 'king')

<tf.Tensor: shape=(1,), dtype=float32, numpy=array([3.19514e-05], dtype=float32)>

In [177]:
test_sentence = 'evil old [mask]'

In [180]:
get_predicted_probability(test_sentence, 'man')

<tf.Tensor: shape=(1,), dtype=float32, numpy=array([1.3034821e-05], dtype=float32)>

In [181]:
get_predicted_probability(test_sentence, 'woman')

<tf.Tensor: shape=(1,), dtype=float32, numpy=array([6.0170372e-05], dtype=float32)>

In [189]:
test_sentence = 'pretti [mask]'
get_predicted_probability(test_sentence, 'girl')

<tf.Tensor: shape=(1,), dtype=float32, numpy=array([2.3883402e-05], dtype=float32)>

In [190]:
get_predicted_probability(test_sentence, 'boy')

<tf.Tensor: shape=(1,), dtype=float32, numpy=array([0.00011371], dtype=float32)>

### access embedding layer:

In [192]:
embeddings = model.layers[0].get_weights()[0]
embeddings.shape

(4127, 64)

### testing lstm model on HW4 data:

In [44]:
file = open('../data/hw4_train.txt', "r")

hw4_data = file.read()
hw4_data = hw4_data.replace('\n', ' ').split(' ')

file.close()

In [49]:
hw4_vocabulary, hw4_vocab_size = get_vocab(hw4_data)

In [50]:
# convert words to vectors
hw4_vectorized_text = list(map(lambda x: hw4_vocabulary[x], hw4_data))
hw4_vectorized_text = np.array(hw4_vectorized_text)

# add [mask] to vocabulary
mask_id = vocab_size
hw4_vocabulary['[mask]'] = mask_id

# split data into sequences of length 20
hw4_vectorized_text_len = len(hw4_vectorized_text) - (len(hw4_vectorized_text) % 20)
hw4_vectorized_text = hw4_vectorized_text[:hw4_vectorized_text_len]
hw4_vectorized_text = np.reshape(hw4_vectorized_text,[-1,20])

In [52]:
hw4_inputs, hw4_labels = get_masked_inputs_labels(hw4_vectorized_text)

In [193]:
# testing model performance on hw4 data:
# model = LSTM_MLM(hw4_vocab_size, 64, 20)
# loss_metric = tf.keras.losses.SparseCategoricalCrossentropy(ignore_class=-1)
# model.compile(loss=loss_metric, optimizer='adam')
# model.fit(x=hw4_inputs, y=hw4_labels, batch_size=20, epochs=50)

## Transformers

references: "Attention Is All You Need" paper by Vaswani et al.

In [None]:
# Multi-head self-attention
class MultiHeadSelfAttention(tf.keras.layers.Layer):
    def __init__(self, embed_dim, num_heads):
        super(MultiHeadSelfAttention, self).__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        assert self.embed_dim % self.num_heads == 0, "Embedding dimension must be divisible by the number of heads."

        self.projection_dim = embed_dim // num_heads
        self.query_dense = tf.keras.layers.Dense(embed_dim)
        self.key_dense = tf.keras.layers.Dense(embed_dim)
        self.value_dense = tf.keras.layers.Dense(embed_dim)
        self.combine_heads = tf.keras.layers.Dense(embed_dim)

    def attention(self, query, key, value):
        score = tf.matmul(query, key, transpose_b=True)
        dim_key = tf.cast(tf.shape(key)[-1], tf.float32)
        scaled_score = score / tf.math.sqrt(dim_key)
        weights = tf.nn.softmax(scaled_score, axis=-1)
        output = tf.matmul(weights, value)
        return output, weights

    def separate_heads(self, x, batch_size):
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.projection_dim))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, inputs):
        batch_size = tf.shape(inputs)[0]
        query = self.query_dense(inputs)
        key = self.key_dense(inputs)
        value = self.value_dense(inputs)
        query = self.separate_heads(query, batch_size)
        key = self.separate_heads(key, batch_size)
        value = self.separate_heads(value, batch_size)

        attention, weights = self.attention(query, key, value)
        attention = tf.transpose(attention, perm=[0, 2, 1, 3])
        concat_attention = tf.reshape(attention, (batch_size, -1, self.embed_dim))
        output = self.combine_heads(concat_attention)
        return output

In [None]:
# Transformer block
class TransformerBlock(tf.keras.layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super(TransformerBlock, self).__init__()
        self.att = MultiHeadSelfAttention(embed_dim, num_heads)
        self.ffn = tf.keras.Sequential(
            [tf.keras.layers.Dense(ff_dim, activation="relu"),
             tf.keras.layers.Dense(embed_dim)]
        )
        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)

    def call(self, inputs, training):
        attn_output = self.att(inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

In [None]:
# Transformer model
class Transformer(tf.keras.Model):
    def __init__(self, vocab_size, input_length, embed_dim, num_heads, ff_dim, num_blocks, rate=0.1):
        super(Transformer, self).__init__()

        self.embedding = tf.keras.layers.Embedding(vocab_size + 1, embed_dim)
        self.pos_encoding = self.positional_encoding(input_length, embed_dim)
        self.transformer_blocks = [TransformerBlock(embed_dim, num_heads, ff_dim, rate) for _ in range(num_blocks)]
        self.dropout = tf.keras.layers.Dropout(rate)
        self.dense = tf.keras.layers.Dense(vocab_size, activation="softmax")

    def positional_encoding(self, input_length, embed_dim):
        angles = 1 / (10000 ** (tf.range(0, embed_dim, 2, dtype=tf.float32) / embed_dim))
        pos_encodings = tf.range(input_length, dtype=tf.float32)[:, tf.newaxis] * angles
        pos_encodings = tf.concat([tf.sin(pos_encodings), tf.cos(pos_encodings)], axis=-1)
        pos_encodings = pos_encodings[tf.newaxis, ...]
        return tf.cast(pos_encodings, tf.float32)

    def call(self, inputs, training):
        seq_length = inputs.shape[1]
        embeddings = self.embedding(inputs)
        embeddings *= tf.math.sqrt(tf.cast(self.embedding.output_dim, tf.float32))
        embeddings += self.pos_encoding[:, :seq_length, :]

        x = self.dropout(embeddings, training=training)

        for transformer_block in self.transformer_blocks:
            x = transformer_block(x, training)

        logits = self.dense(x)
        return logits

In [None]:
# Hyperparameters
vocab_size = 10000
input_length = 20
embed_dim = 64
num_heads = 4
ff_dim = 128
num_blocks = 4

# Instantiate the model
model_t = Transformer(vocab_size, input_length, embed_dim, num_heads, ff_dim, num_blocks)

# Set up the loss function and optimizer
loss_metric = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, ignore_index=-1)
optimizer = tf.keras.optimizers.Adam()

model_t.compile(loss=loss_metric, optimizer=optimizer, metrics=['accuracy'])

# Train the model
model_t.fit(x=inputs, y=labels, validation_split=0.1, batch_size=20, epochs=50)