In [6]:
import tensorflow as tf
import numpy as np
import pickle

In [3]:
# get preprocessed data:
train_file = '../preprocessed_texts.txt'

# The context manager closes the handle even when read() raises,
# which the manual open()/close() pair did not guarantee.
with open(train_file, "r") as file:
    train_data = file.read()

# split the raw text on single spaces into a list of tokens
train_data = train_data.split(' ')


In [14]:
# get vocabulary:
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load a vocabulary.pkl that was produced by a trusted source.
with open('../vocabulary.pkl', 'rb') as fp:
    vocabulary = pickle.load(fp)
    
# size is measured here, before the '[mask]' entry is added to the vocabulary further down
vocab_size = len(vocabulary)

## Bi-directional LSTM Masked Language Modeling

references: 

https://keras.io/examples/nlp/masked_language_modeling/#create-bert-model-pretraining-model-for-masked-language-modeling

https://www.kaggle.com/code/ritvik1909/masked-language-modelling-rnn#Data-Preparation

https://keras.io/examples/nlp/bidirectional_lstm_imdb/

questions:
- should we split data by sentence instead of by fixed window size of 20?


### more data preparation

In [18]:
# look up the integer id of every token in the corpus
vectorized_text = np.array([vocabulary[word] for word in train_data])

# register the [mask] token, using vocab_size as its id
mask_id = vocab_size
vocabulary['[mask]'] = mask_id

# keep only as many ids as fill whole rows of 20, then reshape to (num_rows, 20)
vectorized_text_len = (len(vectorized_text) // 20) * 20
vectorized_text = np.reshape(vectorized_text[:vectorized_text_len], [-1, 20])

In [19]:
# preview the id matrix: one row per window of 20 token ids
vectorized_text

array([[4556,  986, 4556, ..., 1696, 4015,    0],
       [ 718, 4250, 3636, ...,    0, 4556, 1095],
       [   0, 4556, 4556, ..., 1280, 4556, 4556],
       ...,
       [1533,  822, 2609, ..., 1954, 1778, 1731],
       [1449, 2609,    0, ..., 4556, 2856, 2622],
       [4580,    0,  349, ..., 4309, 4556,  165]])

In [20]:
def mask_one_input_label(sequence, mask_token_id=None):
    """
    Mask one randomly chosen position of a token-id sequence.

    :param sequence: sequence of token ids (list or 1-D array) of any length
    :param mask_token_id: id written at the masked position; when None the
                          module-level ``mask_id`` is used
    :return: (masked_input, label), two lists as long as ``sequence``.
             ``masked_input`` is the sequence with one token replaced by the mask id.
             ``label`` is -1 everywhere (ignored by the loss function) except at the
             masked position, where it holds the original token.
    """
    seq_len = len(sequence)
    if seq_len == 0:
        # nothing to mask; np.random.randint(0, 0) would raise ValueError
        return [], []

    if mask_token_id is None:
        mask_token_id = mask_id

    # Draw the position from the real sequence length. A fixed upper bound of 20
    # left shorter sequences unmasked whenever the draw fell outside them and
    # never masked positions beyond index 19 of longer ones.
    mask = np.random.randint(low=0, high=seq_len)

    # add mask to input
    masked_input = [mask_token_id if i == mask else token for i, token in enumerate(sequence)]

    # set all values in label to -1 except the value at the masked position
    label = [token if i == mask else -1 for i, token in enumerate(sequence)]
    return masked_input, label


In [21]:
# get masked inputs and labels
def get_masked_inputs_labels(text):
    """
    Apply mask_one_input_label to every sequence in ``text``.

    :param text: iterable of token-id sequences
    :return: (inputs, labels) as numpy arrays, one row per sequence, in input order
    """
    # one (masked_input, label) pair per sequence, produced in iteration order
    masked_pairs = [mask_one_input_label(row) for row in text]

    inputs = np.array([masked for masked, _ in masked_pairs])
    labels = np.array([target for _, target in masked_pairs])
    return inputs, labels


In [22]:
# mask positions are drawn with np.random and no seed is set in this notebook,
# so the masked inputs differ from run to run
inputs, labels = get_masked_inputs_labels(vectorized_text)

### bi-directional lstm model building and training

In [14]:
# define masked language modeling class
class LSTM_MLM(tf.keras.Model):
    def __init__(self, vocab_size, embed_size, input_length, hidden_size=128):
        """
        Bi-directional LSTM masked language model: predicts a distribution over
        the vocabulary at every position of the input sequence.
        : param vocab_size   : The number of unique words in the data (excluding the [mask] token)
        : param embed_size   : The size of your latent embedding
        : param input_length : Stored on the instance; not used by any layer here
        : param hidden_size  : The number of units in each LSTM direction (defaults to 128)
        """

        super().__init__()

        self.vocab_size = vocab_size
        self.embed_size = embed_size
        self.input_length = input_length
        self.hidden_size = hidden_size

        # embedding layer; the extra row leaves room for the [mask] id, which is
        # set to vocab_size during data preparation
        self.embedding = tf.keras.layers.Embedding(input_dim=self.vocab_size+1, output_dim=self.embed_size)
        # return_sequences=True keeps one output per position so any position can be predicted
        self.lstm = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(self.hidden_size, return_sequences=True))
        # output layer: softmax over the real words only ([mask] is never a target)
        self.dense1 = tf.keras.layers.Dense(self.vocab_size, activation='softmax')


    def call(self, inputs):
        """
        Run embedding -> bi-directional LSTM -> softmax output layer.
        :param inputs: word ids of shape (batch_size, sequence_length)
        :return: probabilities as a tensor of shape (batch_size, sequence_length, vocab_size);
                 these are softmax outputs, not logits
        """

        x = inputs

        x = self.embedding(x)
        x = self.lstm(x)
        x = self.dense1(x)

        return x


In [127]:
# build the LSTM model: embedding size 64, input_length 20 (the window size used in data preparation)
model = LSTM_MLM(vocab_size, 64, 20)
# ignore_class=-1 makes the loss skip every position labelled -1, i.e. all unmasked positions
loss_metric = tf.keras.losses.SparseCategoricalCrossentropy(ignore_class=-1)
# accuracy is not a good measure, so no metrics are passed to compile()
model.compile(loss=loss_metric, optimizer='adam')
model.fit(x=inputs, y=labels, batch_size=20, epochs=20) 
# we do not need validation because our purpose is only to learn the patterns in our training data

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f8abf84fdf0>

### get predicted probability

In [23]:
# make prediction: still in progress

def get_predicted_probability(masked_sentence, target_word, mlm_model=None):
    """
    Probability that a model assigns to ``target_word`` at the '[mask]' position.

    :param masked_sentence: space-separated words containing the token '[mask]'
    :param target_word: word whose probability at the masked position is returned
    :param mlm_model: model to query; when None the module-level ``model`` is used
    :return: tensor of shape (1,) holding the predicted probability
    :raises ValueError: if '[mask]' does not occur in ``masked_sentence``
    :raises KeyError: if a word is missing from ``vocabulary`` (assuming it is a plain dict)
    """
    tokens = masked_sentence.split(' ')
    mask_loc = tokens.index('[mask]')
    target_id = vocabulary[target_word]
    query_id = [vocabulary[q] for q in tokens]

    # fall back to the notebook-level LSTM model so existing two-argument calls keep working
    net = model if mlm_model is None else mlm_model

    # add a batch dimension: (sequence_length,) -> (1, sequence_length)
    query_id = tf.expand_dims(query_id, axis=0)
    pred = net(query_id)[:, mask_loc, target_id]
    return pred


In [24]:
test_sentence = '[mask] like beauti dress'

In [132]:
get_predicted_probability(test_sentence, 'she')

<tf.Tensor: shape=(1,), dtype=float32, numpy=array([0.02857665], dtype=float32)>

In [133]:
get_predicted_probability(test_sentence, 'he')

<tf.Tensor: shape=(1,), dtype=float32, numpy=array([0.01522405], dtype=float32)>

In [185]:
get_predicted_probability(test_sentence, 'queen')

<tf.Tensor: shape=(1,), dtype=float32, numpy=array([0.00066057], dtype=float32)>

In [186]:
get_predicted_probability(test_sentence, 'king')

<tf.Tensor: shape=(1,), dtype=float32, numpy=array([3.19514e-05], dtype=float32)>

In [177]:
test_sentence = 'evil old [mask]'

In [180]:
get_predicted_probability(test_sentence, 'man')

<tf.Tensor: shape=(1,), dtype=float32, numpy=array([1.3034821e-05], dtype=float32)>

In [181]:
get_predicted_probability(test_sentence, 'woman')

<tf.Tensor: shape=(1,), dtype=float32, numpy=array([6.0170372e-05], dtype=float32)>

In [189]:
test_sentence = 'pretti [mask]'
get_predicted_probability(test_sentence, 'girl')

<tf.Tensor: shape=(1,), dtype=float32, numpy=array([2.3883402e-05], dtype=float32)>

In [190]:
get_predicted_probability(test_sentence, 'boy')

<tf.Tensor: shape=(1,), dtype=float32, numpy=array([0.00011371], dtype=float32)>

### access embedding layer:

In [192]:
# Read the embedding matrix through the named attribute instead of model.layers[0],
# which silently picks a different layer if the layer order ever changes.
embeddings = model.embedding.get_weights()[0]
embeddings.shape

(4127, 64)

### testing lstm model on HW4 data:

In [44]:
# The context manager closes the handle even when read() raises.
with open('../data/hw4_train.txt', "r") as file:
    hw4_data = file.read()

# treat line breaks as word separators, then split on single spaces
hw4_data = hw4_data.replace('\n', ' ').split(' ')

In [49]:
# NOTE(review): get_vocab is not defined or imported in any visible cell of this notebook;
# presumably it came from an earlier kernel session or a removed cell — confirm before Restart & Run All.
hw4_vocabulary, hw4_vocab_size = get_vocab(hw4_data)

In [50]:
# convert words to vectors
hw4_vectorized_text = list(map(lambda x: hw4_vocabulary[x], hw4_data))
hw4_vectorized_text = np.array(hw4_vectorized_text)

# add [mask] to vocabulary
# The mask id must come from the HW4 vocabulary size, not the main corpus `vocab_size`:
# the HW4 model's embedding has hw4_vocab_size + 1 rows, so an id taken from the other
# vocabulary can collide with a real HW4 word or fall outside the embedding table.
# (assumes get_vocab returns the number of entries as hw4_vocab_size — TODO confirm)
# This rebinds the module-level mask_id that mask_one_input_label reads.
mask_id = hw4_vocab_size
hw4_vocabulary['[mask]'] = mask_id

# split data into sequences of length 20
hw4_vectorized_text_len = len(hw4_vectorized_text) - (len(hw4_vectorized_text) % 20)
hw4_vectorized_text = hw4_vectorized_text[:hw4_vectorized_text_len]
hw4_vectorized_text = np.reshape(hw4_vectorized_text,[-1,20])

In [52]:
# masking reads the module-level mask_id, which the previous cell rebinds for the HW4 data
hw4_inputs, hw4_labels = get_masked_inputs_labels(hw4_vectorized_text)

In [193]:
# testing model performance on hw4 data:
# model = LSTM_MLM(hw4_vocab_size, 64, 20)
# loss_metric = tf.keras.losses.SparseCategoricalCrossentropy(ignore_class=-1)
# model.compile(loss=loss_metric, optimizer='adam')
# model.fit(x=hw4_inputs, y=hw4_labels, batch_size=20, epochs=50)

## Transformers

references: "Attention Is All You Need" paper by Vaswani et al.

In [19]:
class SingleHeadAttention(tf.keras.layers.Layer):
    """Scaled dot-product attention with a single head and learned q/k/v projections."""

    def __init__(self, d_model):
        """
        :param d_model: output width of each of the query, key and value projections
        """
        super().__init__()
        self.d_model = d_model
        self.query = tf.keras.layers.Dense(d_model)
        self.key = tf.keras.layers.Dense(d_model)
        self.value = tf.keras.layers.Dense(d_model)

    def call(self, q, k, v, mask):
        """
        :param q: tensor fed to the query projection
        :param k: tensor fed to the key projection
        :param v: tensor fed to the value projection
        :param mask: optional tensor; it is multiplied by -1e9 and added to the
                     attention logits, so non-zero entries suppress a position
        :return: (output, attention_weights)
        """
        projected_q = self.query(q)
        projected_k = self.key(k)
        projected_v = self.value(v)

        # scale the dot products by sqrt of the projected key width
        key_width = tf.cast(tf.shape(projected_k)[-1], tf.float32)
        logits = tf.matmul(projected_q, projected_k, transpose_b=True) / tf.math.sqrt(key_width)

        if mask is not None:
            logits = logits + (mask * -1e9)

        attention_weights = tf.nn.softmax(logits, axis=-1)
        return tf.matmul(attention_weights, projected_v), attention_weights

In [20]:
class TransformerBlock(tf.keras.layers.Layer):
    """Self-attention and a feed-forward network, each followed by dropout, a residual add and layer norm."""

    def __init__(self, d_model):
        """
        :param d_model: width of the attention projections and of the block output;
                        the hidden feed-forward layer is four times as wide
        """
        super().__init__()
        self.d_model = d_model
        self.att = SingleHeadAttention(d_model)
        self.ffn = tf.keras.Sequential([
            tf.keras.layers.Dense(d_model * 4, activation='relu'),
            tf.keras.layers.Dense(d_model)
        ])

        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.dropout1 = tf.keras.layers.Dropout(0.1)
        self.dropout2 = tf.keras.layers.Dropout(0.1)

    def call(self, x, training, mask=None):
        """
        :param x: input tensor, used as query, key and value of the self-attention
        :param training: forwarded to both dropout layers
        :param mask: optional attention mask forwarded to the attention layer
        :return: tensor produced by the second layer normalisation
        """
        # attention sub-layer: the attention weights are not needed here
        attended = self.att(x, x, x, mask)[0]
        attended = self.dropout1(attended, training=training)
        attention_residual = self.layernorm1(x + attended)

        # feed-forward sub-layer
        transformed = self.dropout2(self.ffn(attention_residual), training=training)
        return self.layernorm2(attention_residual + transformed)

In [21]:
class Transformer_MLM(tf.keras.Model):
    """Masked language model: token embedding -> one TransformerBlock -> per-position softmax."""

    def __init__(self, vocab_size, embed_size, input_length):
        """
        : param vocab_size   : The number of unique words in the data (excluding the [mask] token)
        : param embed_size   : The size of the embedding, also used as the block's d_model
        : param input_length : Stored on the instance; not used by any layer here
        """
        super().__init__()

        self.vocab_size = vocab_size
        self.embed_size = embed_size
        self.input_length = input_length

        # the extra embedding row leaves room for the [mask] id, which is set to vocab_size
        self.embedding = tf.keras.layers.Embedding(input_dim=self.vocab_size+1, output_dim=self.embed_size)
        # NOTE(review): no positional information is added to the embeddings, so the
        # attention layer has no notion of word order — consider adding a positional encoding.
        self.transformer_block = TransformerBlock(self.embed_size)
        self.dense1 = tf.keras.layers.Dense(self.vocab_size, activation='softmax')

    def call(self, inputs, training=False):
        """
        :param inputs: word ids of shape (batch_size, sequence_length)
        :param training: True while fitting (Keras passes it automatically in fit());
                         False for inference so that dropout is disabled
        :return: softmax probabilities of shape (batch_size, sequence_length, vocab_size)
        """
        x = inputs
        x = self.embedding(x)
        # Forward the caller's flag instead of a hard-coded True, which kept dropout
        # active at prediction time and made the predicted probabilities stochastic.
        x = self.transformer_block(x, training=training)
        x = self.dense1(x)

        return x

In [25]:
# build the transformer model with the same sizes as the LSTM model: embedding size 64, input_length 20
model_t = Transformer_MLM(vocab_size, 64, 20)
# ignore_class=-1 makes the loss skip every position labelled -1, i.e. all unmasked positions
loss_metric = tf.keras.losses.SparseCategoricalCrossentropy(ignore_class=-1)
model_t.compile(loss=loss_metric, optimizer='adam')
# trains on the same masked inputs and labels as the LSTM model
model_t.fit(x=inputs, y=labels, batch_size=20, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x13cf46f50>

In [26]:
def get_predicted_probability_transformer(masked_sentence, target_word):
    """
    Probability that the transformer model ``model_t`` assigns to ``target_word``
    at the '[mask]' position of ``masked_sentence``.

    :param masked_sentence: space-separated words containing the token '[mask]'
    :param target_word: word whose probability at the masked position is returned
    :return: tensor of shape (1,) holding the predicted probability
    """
    words = masked_sentence.split(' ')
    mask_loc = words.index('[mask]')
    target_id = vocabulary[target_word]

    # ids for the whole sentence, with a leading batch dimension of 1
    token_ids = tf.expand_dims([vocabulary[w] for w in words], axis=0)

    probabilities = model_t(token_ids)
    return probabilities[:, mask_loc, target_id]

In [27]:
test_sentence = '[mask] like beauti dress'

In [38]:
get_predicted_probability_transformer(test_sentence, 'she')

<tf.Tensor: shape=(1,), dtype=float32, numpy=array([2.3778994e-05], dtype=float32)>

In [29]:
get_predicted_probability_transformer(test_sentence, 'he')

<tf.Tensor: shape=(1,), dtype=float32, numpy=array([0.00123803], dtype=float32)>

In [30]:
get_predicted_probability_transformer(test_sentence, 'queen')

<tf.Tensor: shape=(1,), dtype=float32, numpy=array([0.01000271], dtype=float32)>

In [31]:
get_predicted_probability_transformer(test_sentence, 'king')

<tf.Tensor: shape=(1,), dtype=float32, numpy=array([9.1098386e-08], dtype=float32)>