In [10]:
import tensorflow as tf
import numpy as np
import pickle

In [11]:
# get preprocessed data:
train_file = '../preprocessed_texts.txt'
file = open(train_file, "r")

train_data = file.read()
train_data = train_data.split(' ')

file.close()


In [12]:
# get vocabulary:
with open('../vocabulary.pkl', 'rb') as fp:
    vocabulary = pickle.load(fp)
    
vocab_size = len(vocabulary)

## Bi-directional LSTM Masked Language Modeling

references: 

https://keras.io/examples/nlp/masked_language_modeling/#create-bert-model-pretraining-model-for-masked-language-modeling

https://www.kaggle.com/code/ritvik1909/masked-language-modelling-rnn#Data-Preparation

https://keras.io/examples/nlp/bidirectional_lstm_imdb/

questions:
- should we split data by sentence instead of by fixed window size of 20?


### more data preparation

In [13]:
# convert words to vectors
vectorized_text = list(map(lambda x: vocabulary[x], train_data))
vectorized_text = np.array(vectorized_text)

# add [mask] to vocabulary
mask_id = vocab_size
vocabulary['[mask]'] = mask_id

# split data into sequences of length 20
vectorized_text_len = len(vectorized_text) - (len(vectorized_text) % 20)
vectorized_text = vectorized_text[:vectorized_text_len]
vectorized_text = np.reshape(vectorized_text,[-1,20])

In [14]:
vectorized_text

array([[4556,  986, 4556, ..., 1696, 4015,    0],
       [ 718, 4250, 3636, ...,    0, 4556, 1095],
       [   0, 4556, 4556, ..., 1280, 4556, 4556],
       ...,
       [1533,  822, 2609, ..., 1954, 1778, 1731],
       [1449, 2609,    0, ..., 4556, 2856, 2622],
       [4580,    0,  349, ..., 4309, 4556,  165]])

In [15]:
def mask_one_input_label(sequence):
    
    # randomly choose one position in sequence to mask
    mask = np.random.randint(low=0, high=20)
    
    # add mask to input
    masked_input = [token if i != mask else mask_id for i, token in enumerate(sequence)]
    
    # set all values in label to -1(ignored by loss function) except the value at the masked position
    label = [-1 if i!= mask else token for i, token in enumerate(sequence)]
    return masked_input, label


In [16]:
# get masked inputs and labels
def get_masked_inputs_labels(text):
    inputs = []
    labels = []

    for seq in text:
        x,y = mask_one_input_label(seq)
        inputs.append(x)
        labels.append(y)
    inputs = np.array(inputs)
    labels = np.array(labels)
    
    return inputs, labels


In [17]:
inputs, labels = get_masked_inputs_labels(vectorized_text)

In [18]:
print(inputs[0], labels[0])

[4556  986 4556 5001 4556 3012    0 4556 1965  846 4641 1398 3772 3232
 2543 1061    0 1696 4015    0] [ -1  -1  -1 389  -1  -1  -1  -1  -1  -1  -1  -1  -1  -1  -1  -1  -1  -1
  -1  -1]


### bi-directional lstm model building and training

In [9]:
# define masked language modeling class
class LSTM_MLM(tf.keras.Model):
    def __init__(self, vocab_size, embed_size, input_length):
        """
        The Model class predicts the next words in a sequence.
        : param vocab_size : The number of unique words in the data
        : param hidden_size   : The size of your desired RNN
        : param embed_size : The size of your latent embedding
        """

        super().__init__()

        self.vocab_size = vocab_size
        self.embed_size = embed_size
        self.input_length = input_length

        ## TODO: define your trainable variables and/or layers here. This should include an
        ## embedding component, and any other variables/layers you require.

        # embedding layer
        self.embedding = tf.keras.layers.Embedding(input_dim=self.vocab_size+1, output_dim=self.embed_size)
        self.lstm = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, return_sequences=True))
        self.dense1 = tf.keras.layers.Dense(self.vocab_size, activation='softmax')

        # fully connected linear layers


    def call(self, inputs):
        """
        You must use an embedding layer as the first layer of your network (i.e. tf.nn.embedding_lookup or tf.keras.layers.Embedding)
        :param inputs: word ids of shape (batch_size, 2)
        :return: logits: The batch element probabilities as a tensor of shape (batch_size, vocab_size)
        """

        # embedding layer
        x = inputs
        
        x = self.embedding(x)
        x = self.lstm(x)
        x = self.dense1(x)

        
        return x


In [12]:
model = LSTM_MLM(vocab_size, 64, 20)
loss_metric = tf.keras.losses.SparseCategoricalCrossentropy(ignore_class=-1)
# accuracy is not a good measure
model.compile(loss=loss_metric, optimizer='adam')
model.fit(x=inputs, y=labels, batch_size=100, epochs=20) 


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7fc7cb52a890>

In [14]:
embeddings = model.layers[0].get_weights()[0]

In [16]:
embeddings.shape

(5002, 64)

In [18]:
np.savetxt("bidirectional_lstm_embedding.csv", embeddings, delimiter=",")

In [13]:
model.save("bi_lstm")



INFO:tensorflow:Assets written to: bi_lstm/assets


INFO:tensorflow:Assets written to: bi_lstm/assets


In [22]:
# load model:
bi_lstm_model = tf.keras.models.load_model("bi_lstm")

In [41]:
bi_lstm_model(inputs[:1])

<tf.Tensor: shape=(1, 20, 5001), dtype=float32, numpy=
array([[[1.08420591e-05, 4.81920279e-06, 2.71167823e-06, ...,
         5.65170876e-06, 3.07116788e-05, 1.06291656e-04],
        [6.13006979e-09, 1.24815851e-04, 2.29068576e-07, ...,
         7.75018416e-05, 7.67877282e-06, 1.62149081e-06],
        [1.61006656e-02, 2.63795243e-07, 1.84801465e-07, ...,
         3.70968991e-07, 5.22544324e-05, 1.11797908e-05],
        ...,
        [8.99803638e-07, 2.04359094e-05, 1.09520137e-04, ...,
         2.73105870e-05, 1.16686970e-05, 1.19550816e-06],
        [2.74908915e-03, 1.22807176e-09, 9.74802097e-05, ...,
         1.06047260e-09, 2.86014483e-08, 1.22358079e-08],
        [9.74070531e-07, 3.86170897e-04, 1.95058037e-05, ...,
         3.52743955e-04, 3.49714799e-04, 5.24173129e-06]]], dtype=float32)>

In [29]:
model

<__main__.LSTM_MLM at 0x7fc7e44d5120>

### get predicted probability

In [31]:
# make prediction

def get_predicted_probability(masked_sentence, target_word, model):
    masked_sentence = masked_sentence.split(' ')
    mask_loc = masked_sentence.index('[mask]')
    target_id = vocabulary[target_word]
    query_id = [vocabulary[q] for q in masked_sentence]
    

    query_id = tf.expand_dims(query_id, axis=0)
    #print(query_id.shape, query_id)
    pred = model(query_id, training=False)[:,mask_loc, target_id]
    return pred


In [59]:
test_sentence = '[mask] like beautiful dress'

In [70]:
get_predicted_probability(test_sentence, 'she', bi_lstm_model)

ValueError: Exception encountered when calling layer 'lstm_mlm_1' (type LSTM_MLM).

Could not find matching concrete function to call loaded from the SavedModel. Got:
  Positional arguments (1 total):
    * <tf.Tensor 'inputs:0' shape=(1, 2) dtype=int32>
  Keyword arguments: {'training': False}

 Expected these arguments to match one of the following 4 option(s):

Option 1:
  Positional arguments (1 total):
    * TensorSpec(shape=(None, 20), dtype=tf.int64, name='inputs')
  Keyword arguments: {'training': False}

Option 2:
  Positional arguments (1 total):
    * TensorSpec(shape=(None, 20), dtype=tf.int64, name='inputs')
  Keyword arguments: {'training': True}

Option 3:
  Positional arguments (1 total):
    * TensorSpec(shape=(None, 20), dtype=tf.int64, name='input_1')
  Keyword arguments: {'training': False}

Option 4:
  Positional arguments (1 total):
    * TensorSpec(shape=(None, 20), dtype=tf.int64, name='input_1')
  Keyword arguments: {'training': True}

Call arguments received by layer 'lstm_mlm_1' (type LSTM_MLM):
  • args=('tf.Tensor(shape=(1, 2), dtype=int32)',)
  • kwargs=<class 'inspect._empty'>

In [61]:
get_predicted_probability(test_sentence, 'he', model)

<tf.Tensor: shape=(1,), dtype=float32, numpy=array([0.00055029], dtype=float32)>

In [62]:
get_predicted_probability(test_sentence, 'queen', model)

<tf.Tensor: shape=(1,), dtype=float32, numpy=array([0.00154718], dtype=float32)>

In [63]:
get_predicted_probability(test_sentence, 'king', model)

<tf.Tensor: shape=(1,), dtype=float32, numpy=array([1.7876113e-05], dtype=float32)>

In [64]:
test_sentence = 'evil old [mask]'

In [65]:
get_predicted_probability(test_sentence, 'man', model)

<tf.Tensor: shape=(1,), dtype=float32, numpy=array([0.06361172], dtype=float32)>

In [66]:
get_predicted_probability(test_sentence, 'woman', model)

<tf.Tensor: shape=(1,), dtype=float32, numpy=array([0.19437508], dtype=float32)>

In [67]:
test_sentence = 'pretty [mask]'
get_predicted_probability(test_sentence, 'girl', model)

<tf.Tensor: shape=(1,), dtype=float32, numpy=array([0.0031174], dtype=float32)>

In [68]:
get_predicted_probability(test_sentence, 'boy', model)

<tf.Tensor: shape=(1,), dtype=float32, numpy=array([0.0016107], dtype=float32)>

### access embedding layer:

In [192]:
embeddings = model.layers[0].get_weights()[0]
embeddings.shape

(4127, 64)

### testing lstm model on HW4 data:

In [44]:
file = open('../data/hw4_train.txt', "r")

hw4_data = file.read()
hw4_data = hw4_data.replace('\n', ' ').split(' ')

file.close()

In [49]:
hw4_vocabulary, hw4_vocab_size = get_vocab(hw4_data)

In [50]:
# convert words to vectors
hw4_vectorized_text = list(map(lambda x: hw4_vocabulary[x], hw4_data))
hw4_vectorized_text = np.array(hw4_vectorized_text)

# add [mask] to vocabulary
mask_id = vocab_size
hw4_vocabulary['[mask]'] = mask_id

# split data into sequences of length 20
hw4_vectorized_text_len = len(hw4_vectorized_text) - (len(hw4_vectorized_text) % 20)
hw4_vectorized_text = hw4_vectorized_text[:hw4_vectorized_text_len]
hw4_vectorized_text = np.reshape(hw4_vectorized_text,[-1,20])

In [52]:
hw4_inputs, hw4_labels = get_masked_inputs_labels(hw4_vectorized_text)

In [193]:
# testing model performance on hw4 data:
# model = LSTM_MLM(hw4_vocab_size, 64, 20)
# loss_metric = tf.keras.losses.SparseCategoricalCrossentropy(ignore_class=-1)
# model.compile(loss=loss_metric, optimizer='adam')
# model.fit(x=hw4_inputs, y=hw4_labels, batch_size=20, epochs=50)

## Transformers

references: "Attention Is All You Need" paper by Vaswani et al.

In [19]:
class SingleHeadAttention(tf.keras.layers.Layer):
    def __init__(self, d_model):
        super(SingleHeadAttention, self).__init__()
        self.d_model = d_model
        self.query = tf.keras.layers.Dense(d_model)
        self.key = tf.keras.layers.Dense(d_model)
        self.value = tf.keras.layers.Dense(d_model)

    def call(self, q, k, v, mask):
        q = self.query(q)
        k = self.key(k)
        v = self.value(v)
        
        dk = tf.cast(tf.shape(k)[-1], tf.float32)
        scaled_attention_logits = tf.matmul(q, k, transpose_b=True) / tf.math.sqrt(dk)
        
        if mask is not None:
            scaled_attention_logits += (mask * -1e9)
        
        attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)
        output = tf.matmul(attention_weights, v)

        return output, attention_weights

In [20]:
class TransformerBlock(tf.keras.layers.Layer):
    def __init__(self, d_model):
        super(TransformerBlock, self).__init__()
        self.d_model = d_model
        self.att = SingleHeadAttention(d_model)
        self.ffn = tf.keras.Sequential([
            tf.keras.layers.Dense(d_model * 4, activation='relu'),
            tf.keras.layers.Dense(d_model)
        ])

        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.dropout1 = tf.keras.layers.Dropout(0.1)
        self.dropout2 = tf.keras.layers.Dropout(0.1)

    def call(self, x, training, mask=None):
        attn_output, _ = self.att(x, x, x, mask)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(x + attn_output)

        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        out2 = self.layernorm2(out1 + ffn_output)

        return out2

In [21]:
class Transformer_MLM(tf.keras.Model):
    def __init__(self, vocab_size, embed_size, input_length):
        super().__init__()

        self.vocab_size = vocab_size
        self.embed_size = embed_size
        self.input_length = input_length

        self.embedding = tf.keras.layers.Embedding(input_dim=self.vocab_size+1, output_dim=self.embed_size)
        self.transformer_block = TransformerBlock(self.embed_size)
        self.dense1 = tf.keras.layers.Dense(self.vocab_size, activation='softmax')

    def call(self, inputs):
        x = inputs
        x = self.embedding(x)
        x = self.transformer_block(x, training=True)
        x = self.dense1(x)
        
        return x

In [23]:
model_t = Transformer_MLM(vocab_size, 64, 20)
loss_metric = tf.keras.losses.SparseCategoricalCrossentropy(ignore_class=-1)
model_t.compile(loss=loss_metric, optimizer='adam')
model_t.fit(x=inputs, y=labels, batch_size=100, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x13723ead0>

In [24]:
embeddings_t = model_t.layers[0].get_weights()[0]

In [25]:
embeddings_t.shape

(5002, 64)

In [27]:
np.savetxt("transformer_embedding.csv", embeddings_t, delimiter=",")

In [28]:
model_t.save("transformer")



INFO:tensorflow:Assets written to: transformer/assets


INFO:tensorflow:Assets written to: transformer/assets


In [29]:
# load model:
transformer_model = tf.keras.models.load_model("transformer")

In [30]:
transformer_model(inputs[:1])

<tf.Tensor: shape=(1, 20, 5001), dtype=float32, numpy=
array([[[1.10627651e-01, 3.65460156e-20, 9.09277631e-09, ...,
         3.88752805e-13, 4.70511163e-13, 3.14420430e-15],
        [1.12201355e-01, 2.67181530e-18, 1.92025240e-07, ...,
         7.16952149e-12, 1.09371312e-12, 3.39503217e-11],
        [8.50980505e-02, 9.32267641e-20, 2.36657954e-07, ...,
         1.14831940e-12, 9.01196045e-12, 9.00208575e-15],
        ...,
        [1.26456693e-01, 8.70563715e-20, 7.03296932e-09, ...,
         8.52647976e-13, 1.41830514e-12, 1.28253677e-13],
        [8.51022676e-02, 1.68086647e-16, 2.43739305e-06, ...,
         2.54270882e-10, 6.30315678e-12, 4.46802231e-12],
        [7.29921013e-02, 5.40546146e-15, 2.71985555e-05, ...,
         4.43284826e-10, 1.69307395e-08, 1.03836890e-10]]], dtype=float32)>

In [34]:
test_sentence = '[mask] like beautiful dress'

In [36]:
get_predicted_probability(test_sentence, 'she', model_t)

<tf.Tensor: shape=(1,), dtype=float32, numpy=array([0.00011386], dtype=float32)>

In [37]:
get_predicted_probability(test_sentence, 'he', model_t)

<tf.Tensor: shape=(1,), dtype=float32, numpy=array([0.0017492], dtype=float32)>

In [38]:
get_predicted_probability(test_sentence, 'queen', model_t)

<tf.Tensor: shape=(1,), dtype=float32, numpy=array([0.09520493], dtype=float32)>

In [39]:
get_predicted_probability(test_sentence, 'king', model_t)

<tf.Tensor: shape=(1,), dtype=float32, numpy=array([0.00025578], dtype=float32)>

In [40]:
test_sentence_evil = 'evil old [mask]'

In [43]:
get_predicted_probability(test_sentence_evil, 'man', model_t)

<tf.Tensor: shape=(1,), dtype=float32, numpy=array([0.02564601], dtype=float32)>

In [44]:
get_predicted_probability(test_sentence_evil, 'woman', model_t)

<tf.Tensor: shape=(1,), dtype=float32, numpy=array([0.04206395], dtype=float32)>

In [69]:
test_brave = 'brave [mask]'
get_predicted_probability(test_brave, 'woman', model_t)

<tf.Tensor: shape=(1,), dtype=float32, numpy=array([6.268294e-05], dtype=float32)>

In [70]:
get_predicted_probability(test_brave, 'man', model_t)

<tf.Tensor: shape=(1,), dtype=float32, numpy=array([0.03903314], dtype=float32)>

In [71]:
test_power = 'powerful [mask]'
get_predicted_probability(test_power, 'woman', model_t)

<tf.Tensor: shape=(1,), dtype=float32, numpy=array([6.4778555e-06], dtype=float32)>

In [72]:
get_predicted_probability(test_brave, 'man', model_t)

<tf.Tensor: shape=(1,), dtype=float32, numpy=array([0.01902533], dtype=float32)>