In [None]:
# Kng Yew Chian, October 2023
# Using pre-trained word embeddings and a BILSTM to classify each entity with the IO tagging scheme

In [None]:
!pip install --upgrade gensim



In [None]:
import csv
import gc
import pickle

import gensim.downloader
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn import metrics
from sklearn.preprocessing import normalize

In [None]:
#embeddings = gensim.downloader.load('word2vec-google-news-300')

In [None]:
# Mount Google Drive at ./drive so the pickled embeddings and the eng.* data files
# referenced below (./drive/MyDrive/...) can be read.
from google.colab import drive
drive.mount('./drive')

Drive already mounted at ./drive; to attempt to forcibly remount, call drive.mount("./drive", force_remount=True).


In [None]:
#with open('./drive/MyDrive/embeddings.pkl', 'wb') as file:
#    pickle.dump(embeddings, file)

In [None]:
# Load the embeddings object that was pickled to Drive (see the commented-out dump cell above).
# NOTE(review): pickle.load can execute arbitrary code, so only load a file you created yourself.
with open("./drive/MyDrive/embeddings.pkl", "rb") as f:
  embeddings = pickle.load(f)

# Q1.1 Based on word2vec embeddings you have downloaded, use cosine similarity to find the most similar word to each of these words: (a) “student”; (b) “Apple”; (c) “apple”. Report the most similar word and its cosine similarity.

In [None]:
# Report the nearest neighbour (word, cosine similarity) of each query word.
for query_word in ('student', 'Apple', 'apple'):
  print(embeddings.most_similar(query_word)[0])

('students', 0.7294867038726807)
('Apple_AAPL', 0.7456986308097839)
('apples', 0.720359742641449)


In [None]:
# Cross-check the most_similar score for 'student' by computing the cosine similarity directly.
student_embeddings, students_embeddings = (
  embeddings[word].reshape(1, 300) for word in ('student', 'students')
)
print(metrics.pairwise.cosine_similarity(student_embeddings, students_embeddings))

[[0.7294867]]


processing data

In [None]:
header_names = ['word', 'useless1', 'useless2', 'tag']

# Options shared by the three splits:
#  * skip_blank_lines=False keeps blank lines as all-NaN rows; they are the sentence separators.
#  * quoting=csv.QUOTE_NONE disables quote handling entirely, so '"' is read as an ordinary token.
#    (Declaring '|' as the quote character instead would make any '|' token open a quoted field.)
conll_read_options = dict(
  header=None,
  sep=' ',
  names=header_names,
  skip_blank_lines=False,
  quoting=csv.QUOTE_NONE,
)

train_raw = pd.read_csv("./drive/MyDrive/eng.train", **conll_read_options)
validation_raw = pd.read_csv("./drive/MyDrive/eng.testa", **conll_read_options)
test_raw = pd.read_csv("./drive/MyDrive/eng.testb", **conll_read_options)

In [None]:
# reads " as string instead of quote
# train_raw[train_raw['word'] == '"']

In [None]:
# Row positions in train_raw whose 'word' is missing (presumably the blank separator lines kept by
# skip_blank_lines=False). NOTE(review): not referenced anywhere else in the visible notebook.
null_indices = train_raw[train_raw['word'].isnull()].reset_index()['index']

In [None]:
def drop_na_labels(data):
  """Return only the 'word' and 'tag' columns of `data`.

  Despite the name, no rows are dropped: rows with missing values are kept
  because the sentence splitter uses all-NaN rows as sentence boundaries.
  """
  return data[['word', 'tag']]

In [None]:
def get_tag_to_integer_dict(train_without_na):
  """Map every distinct value of the 'tag' column to an integer id.

  Ids are assigned 0, 1, 2, ... in order of first appearance in the column.
  """
  distinct_tags = train_without_na['tag'].unique()
  return {tag: tag_id for tag_id, tag in enumerate(distinct_tags)}

In [None]:
def get_tag_to_integer_dict_from_raw(train_raw):
  """Build the tag -> integer id dictionary from a raw split.

  Rows whose 'tag' is missing are removed first, so NaN never becomes a dictionary key.
  """
  tagged_rows = (
    train_raw[['word', 'tag']]
    .dropna(subset=['tag'])
    .reset_index(drop=True)
  )
  return get_tag_to_integer_dict(tagged_rows)

In [None]:
# The label ids come from the training split only; the same dictionary is passed to every split below.
tag_to_integer_dictionary = get_tag_to_integer_dict_from_raw(train_raw)

In [None]:
def get_sentences_wordstags_array(train_without_na, max_chunk_length=32):
  """Group token rows into sentences of [word, tag] pairs.

  A row in which every column is missing marks a sentence boundary. Sentences
  longer than `max_chunk_length` tokens are cut into consecutive chunks of at
  most that many tokens, to keep the number of timesteps small.

  Args:
    train_without_na: DataFrame whose first column holds the word and whose
      second column holds the tag.
    max_chunk_length: maximum number of tokens per returned sentence. The
      default of 32 matches the chunk size this notebook has always produced.

  Returns:
    A list of sentences; each sentence is a non-empty list of [word, tag] lists.

  Raises:
    ValueError: if `max_chunk_length` is smaller than 1.
  """
  if max_chunk_length < 1:
    raise ValueError("max_chunk_length must be at least 1")

  sentences_train = []
  new_sentence = []

  # Columns are addressed by position explicitly; integer keys on a labelled row are deprecated.
  is_separator = train_without_na.isna().all(axis=1)
  words = train_without_na.iloc[:, 0]
  tags = train_without_na.iloc[:, 1]

  for word, tag, separator in zip(words, tags, is_separator):
    if separator:
      # Only close a sentence that has tokens, so a boundary right after a
      # full chunk (or two consecutive blank rows) adds no empty sentence.
      if new_sentence:
        sentences_train.append(new_sentence)
        new_sentence = []
      continue

    new_sentence.append([word, tag])
    if len(new_sentence) >= max_chunk_length:
      sentences_train.append(new_sentence)
      new_sentence = []

  # Keep the last sentence when the data does not end with a blank row.
  if new_sentence:
    sentences_train.append(new_sentence)

  return sentences_train

In [None]:
def get_sentences_embeddingstags_array(sentences_wordstags_array, tag_to_integer_dictionary):
  """Replace each word by its embedding vector and each tag by its integer id.

  Words are looked up in the module-level `embeddings` object. A word that is
  not found is represented by np.zeros(300); a found vector is cast to float32
  and L2-normalised. Tokens whose tag is not a key of
  `tag_to_integer_dictionary` are skipped.

  Returns:
    A list with one entry per input sentence; each entry is a list of
    [embedding, tag_id] pairs.
  """
  def embed(word):
    # Out-of-vocabulary words get the all-zero vector.
    if word not in embeddings:
      return np.zeros(300)
    # normalize() expects a 2D array, so normalise a single-row matrix and flatten it back.
    as_row = embeddings[word].astype(np.float32).reshape(1, -1)
    return normalize(as_row, axis=1, norm='l2').flatten()

  embeddings_in_sentences = []
  for sentence in sentences_wordstags_array:
    converted_sentence = [
      [embed(word), tag_to_integer_dictionary[tag]]
      for word, tag in sentence
      if tag in tag_to_integer_dictionary
    ]
    embeddings_in_sentences.append(converted_sentence)

  return embeddings_in_sentences

In [None]:
def get_without_labels(sentences_embeddingstags):
  """Return the sentences with the tag removed, keeping only the first item of each pair."""
  return [
    [embedding for embedding, _tag in sentence]
    for sentence in sentences_embeddingstags
  ]

In [None]:
def get_labels(sentences_embeddingstags):
  """Return one numpy array of tag ids per sentence (the second item of each pair)."""
  return [
    np.array([tag for _embedding, tag in sentence])
    for sentence in sentences_embeddingstags
  ]

In [None]:
def pad_without_labels_and_labels(without_labels, labels, max_sentence_length):
  """Pad inputs and labels at the end of each sentence up to `max_sentence_length`.

  Inputs are padded with 0 as float32; labels are padded with 999, the value
  the loss and the metric code treat as "no label here".

  Returns:
    (padded inputs, padded labels)
  """
  print("padding: ", max_sentence_length)
  shared_options = dict(padding="post", maxlen=max_sentence_length)
  padded = tf.keras.utils.pad_sequences(without_labels, dtype="float32", value=0, **shared_options)
  padded_labels = tf.keras.utils.pad_sequences(labels, value=999, **shared_options)

  return padded, padded_labels

In [None]:
def convert_padded_and_padded_labels_to_np_arrays(padded, padded_labels):
  """Return both arguments as numpy arrays: (inputs, labels).

  The inputs are laid out as [batch, timestep, feature].
  """
  return np.array(padded), np.array(padded_labels)



In [None]:
def process_raw_to_input_and_labels(raw, tag_to_integer_dictionary):
  """Turn one raw split into unpadded model inputs and labels.

  Pipeline: keep the word/tag columns, group rows into sentences, then replace
  words by embeddings and tags by integer ids.

  Returns:
    (inputs, labels, tag_to_integer_dictionary, num_classes), where inputs and
    labels hold one entry per sentence and num_classes is the dictionary size.
  """
  sentences_wordstags = get_sentences_wordstags_array(drop_na_labels(raw))
  sentences_embeddingstags = get_sentences_embeddingstags_array(sentences_wordstags, tag_to_integer_dictionary)

  return (
    get_without_labels(sentences_embeddingstags),
    get_labels(sentences_embeddingstags),
    tag_to_integer_dictionary,
    len(tag_to_integer_dictionary),
  )

In [None]:
def pad_input_and_labels(without_labels, labels, max_sentence_length):
  """Pad inputs and labels to `max_sentence_length` and return them as numpy arrays."""
  padded_pair = pad_without_labels_and_labels(without_labels, labels, max_sentence_length)
  return convert_padded_and_padded_labels_to_np_arrays(*padded_pair)

In [None]:
# Convert every split with the label dictionary built from the training data.
unpadded_train, unpadded_train_labels, tag_to_integer_dictionary, num_classes = process_raw_to_input_and_labels(train_raw, tag_to_integer_dictionary)
# The development split must be built from validation_raw (eng.testa). Building it from train_raw
# makes every "validation" score a copy of the training score.
unpadded_val, unpadded_val_labels, unused_val_dictionary, unused_val_num_classes = process_raw_to_input_and_labels(validation_raw, tag_to_integer_dictionary)
unpadded_test, unpadded_test_labels, unused_test_dictionary, unused_test_num_classes = process_raw_to_input_and_labels(test_raw, tag_to_integer_dictionary)

# All three splits are padded to the longest sentence found in any of them, so they share one input shape.
max_sentence_length = max(
  (len(sentence) for split in (unpadded_train, unpadded_val, unpadded_test) for sentence in split),
  default=0,
)

train, train_labels = pad_input_and_labels(unpadded_train, unpadded_train_labels, max_sentence_length)
val, val_labels = pad_input_and_labels(unpadded_val, unpadded_val_labels, max_sentence_length)
test, test_labels = pad_input_and_labels(unpadded_test, unpadded_test_labels, max_sentence_length)

padding:  32
padding:  32
padding:  32


In [None]:
# Sanity check: the padded training inputs must not contain NaN.
np.isnan(train).any()

False

In [None]:
# Shapes are (sentences, timesteps, embedding dimensions).
print(train.shape)
print(test.shape)

(826, 32, 300)
(941, 32, 300)


#Q1.2(a) Describe the size (number of sentences) of the training, development and test file for CoNLL2003. Specify the complete set of all possible word labels based on the tagging scheme (IO, BIO, etc.) you chose.

In [None]:
# One row of the padded array per sentence, so the array length is the sentence count.
print(f"Number of Sentences in train: {len(train)}")
print(f"Number of Sentences in development: {len(val)}")
print(f"Number of Sentences in test: {len(test)}")
print("Possible word labels:", end=" ")
for label_name in tag_to_integer_dictionary:
  print(label_name, end=", ")

Number of Sentences in train: 826
Number of Sentences in development: 826
Number of Sentences in test: 941
Possible word labels: I-ORG, O, I-MISC, I-PER, I-LOC, B-LOC, B-MISC, B-ORG, 

#Q1.2(b) Choose an example sentence from the training set of CoNLL2003 that has at least two named entities with more than one word. Explain how to form complete named entities from the label for each word, and list all the named entities in this sentence.

In [None]:
# Example Sentence
"""
Germany I-LOC
's O
representative O
to O
the O
European I-ORG
Union I-ORG
's O
veterinary O
committee O
Werner I-PER
Zwingmann I-PER
said O
on O
Wednesday O
consumers O
should O
buy O
sheepmeat O
from O
countries O
other O
than O
Britain I-LOC
until O
the O
scientific O
advice O
was O
clearer O
. O
"""
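# When the tokens are read from left to right, every maximal run of consecutive tokens that carry
# the same entity type (e.g. I-ORG I-ORG) forms one named entity; an O tag or a change of type ends it.
# (The label set also contains B- tags, which start a new entity directly after another entity of the same type.)
# Named entities in this sentence: Germany (LOC), European Union (ORG), Werner Zwingmann (PER), Britain (LOC).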

"\nGermany I-LOC\n's O\nrepresentative O\nto O\nthe O\nEuropean I-ORG\nUnion I-ORG\n's O\nveterinary O\ncommittee O\nWerner I-PER\nZwingmann I-PER\nsaid O\non O\nWednesday O\nconsumers O\nshould O\nbuy O\nsheepmeat O\nfrom O\ncountries O\nother O\nthan O\nBritain I-LOC\nuntil O\nthe O\nscientific O\nadvice O\nwas O\nclearer O\n. O\n"

Construct model

In [None]:
import keras
from keras import layers

In [None]:
# Token-level tagger: one softmax distribution over the tag set for every timestep.
# Input shape per sentence is (timesteps, embedding dimensions), taken from the padded training array.
inputs = keras.Input(shape = (train.shape[1], train.shape[2]))
# Timesteps whose features are all 0.0 (the padding) are skipped by the layers that follow.
# NOTE(review): out-of-vocabulary words are encoded as all-zero vectors too, so they are masked
# here as well. Confirm that this is intended.
x = layers.Masking(mask_value=0.0)(inputs)
# 16 units per direction; return_sequences=True keeps one output vector per timestep.
x = layers.Bidirectional(layers.LSTM(16, return_sequences=True))(x)
# The Dense layer is applied to every timestep and yields num_classes probabilities per token.
outputs = layers.Dense(num_classes, activation="softmax")(x)
model = keras.Model(inputs, outputs)
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 32, 300)]         0         
                                                                 
 masking (Masking)           (None, 32, 300)           0         
                                                                 
 bidirectional (Bidirection  (None, 32, 32)            40576     
 al)                                                             
                                                                 
 dense (Dense)               (None, 32, 8)             264       
                                                                 
Total params: 40840 (159.53 KB)
Trainable params: 40840 (159.53 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
from keras.callbacks import Callback
from sklearn.metrics import f1_score, precision_score, recall_score

from tensorflow.keras.callbacks import Callback
from sklearn.metrics import f1_score, precision_score, recall_score
import numpy as np

class F1ScoreCallback(Callback):
    """Print token-level weighted precision, recall and F1 after every epoch.

    Scores are computed on the training and the validation data, ignoring
    every position whose label equals `mask_value` (the label padding).

    Args:
        train_data: padded training inputs passed to `model.predict`.
        train_labels: integer labels for `train_data`, padded with `mask_value`.
        val_data: padded validation inputs.
        val_labels: integer labels for `val_data`, padded with `mask_value`.
        mask_value: label value that marks padding (default 999).
    """

    def __init__(self, train_data, train_labels, val_data, val_labels, mask_value=999):
        super().__init__()
        self.train_data = train_data
        self.train_labels = train_labels
        self.val_data = val_data
        self.val_labels = val_labels
        self.mask_value = mask_value

    def _masked_scores(self, data, labels):
        """Return (precision, recall, f1) over the non-padding tokens of one split."""
        predicted_indices = np.argmax(self.model.predict(data), axis=-1)

        # Boolean indexing drops the padded positions and flattens both arrays.
        is_real_token = (labels != self.mask_value)
        true_labels = labels[is_real_token]
        predicted_labels = predicted_indices[is_real_token]

        # Score over every class that occurs, not only the predicted ones. Restricting the
        # scores to predicted classes hides every class the model never predicts and
        # inflates the averages. zero_division=0 scores such classes as 0 without a warning.
        score_options = dict(average='weighted', zero_division=0)
        return (
            precision_score(true_labels, predicted_labels, **score_options),
            recall_score(true_labels, predicted_labels, **score_options),
            f1_score(true_labels, predicted_labels, **score_options),
        )

    def on_epoch_end(self, epoch, logs=None):
        """Compute and print the scores for both splits; `logs` is not used."""
        train_precision, train_recall, train_f1 = self._masked_scores(self.train_data, self.train_labels)
        val_precision, val_recall, val_f1 = self._masked_scores(self.val_data, self.val_labels)

        # Print the metrics
        print(f'\nEpoch {epoch + 1}')
        print(f'Training Precision: {train_precision:.4f} | Training Recall: {train_recall:.4f} | Training F1: {train_f1:.4f}')
        print(f'Validation Precision: {val_precision:.4f} | Validation Recall: {val_recall:.4f} | Validation F1: {val_f1:.4f}')

# Then, create an instance of the F1ScoreCallback
f1_score_callback = F1ScoreCallback(train_data=train, train_labels=train_labels, val_data=val, val_labels=val_labels)

In [None]:
def masked_loss_function(y_true, y_pred):
  """Mean sparse categorical cross-entropy over the non-padding positions.

  Args:
    y_true: integer class labels; positions holding 999 are padding and do
      not contribute to the loss.
    y_pred: predicted class probabilities for every position.

  Returns:
    The summed loss of the unmasked positions divided by their count, or 0
    when every position is masked.
  """
  is_real_token = tf.not_equal(y_true, 999)
  mask = tf.cast(is_real_token, tf.float32)

  # 999 is not a valid class index, so substitute class 0 before computing
  # the cross-entropy; those positions are zeroed out again below.
  y_true_masked = tf.where(is_real_token, y_true, tf.zeros_like(y_true))

  loss = tf.keras.losses.sparse_categorical_crossentropy(y_true_masked, y_pred)
  loss *= mask

  # divide_no_nan returns 0 instead of NaN when a batch has no unmasked position.
  return tf.math.divide_no_nan(tf.reduce_sum(loss), tf.reduce_sum(mask))

In [None]:
# Adam with element-wise gradient clipping: clipvalue=0.5 caps each gradient value at 0.5 in magnitude.
learning_rate = 0.001
adam_optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, clipvalue=0.5)

In [None]:
# Train with the padding-aware loss; the callback prints precision/recall/F1 after every epoch.
model.compile(optimizer=adam_optimizer, loss=masked_loss_function, metrics=["accuracy"])
# Shorter run without validation, kept for quick experiments:
#model.fit(train, train_labels, batch_size=32, epochs=2)
model.fit(train, train_labels, batch_size=32, epochs=10, validation_data=(val, val_labels), callbacks=[f1_score_callback])

Epoch 1/10

Epoch 1
Training Precision: 0.7772 | Training Recall: 1.0000 | Training F1: 0.8746
Validation Precision: 0.7772 | Validation Recall: 1.0000 | Validation F1: 0.8746
Epoch 2/10

Epoch 2
Training Precision: 0.7772 | Training Recall: 1.0000 | Training F1: 0.8746
Validation Precision: 0.7772 | Validation Recall: 1.0000 | Validation F1: 0.8746
Epoch 3/10

Epoch 3
Training Precision: 0.7882 | Training Recall: 0.8996 | Training F1: 0.7915
Validation Precision: 0.7882 | Validation Recall: 0.8996 | Validation F1: 0.7915
Epoch 4/10

Epoch 4
Training Precision: 0.8009 | Training Recall: 0.9114 | Training F1: 0.8162
Validation Precision: 0.8009 | Validation Recall: 0.9114 | Validation F1: 0.8162
Epoch 5/10

Epoch 5
Training Precision: 0.8219 | Training Recall: 0.9411 | Training F1: 0.8639
Validation Precision: 0.8219 | Validation Recall: 0.9411 | Validation F1: 0.8639
Epoch 6/10

Epoch 6
Training Precision: 0.8446 | Training Recall: 0.8980 | Training F1: 0.8314
Validation Precision: 0.8

<keras.src.callbacks.History at 0x792c60256f50>

In [None]:
# Testing layer outputs
from tensorflow import keras
from keras import layers
import numpy as np

# Diagnostic: build a model that returns the output of every layer, to find where NaNs first appear.
layer_outputs = [layer.output for layer in model.layers]  # includes the Input layer's output
activation_model = keras.Model(inputs=model.input, outputs=layer_outputs)

# Run the training inputs through it; the result holds one array per layer, in layer order.
all_layer_activations = activation_model.predict(train)

# First pass: report whether any layer output contains NaN (one line per affected layer).
for layer_activation in all_layer_activations:
    # Check if the activation contains NaNs
    if np.isnan(layer_activation).any():
        print("NaNs detected")

# Second pass: the same check, but naming the affected layer.
for layer, activation in zip(model.layers, all_layer_activations):
    if np.isnan(activation).any():
        print(f"NaN detected in layer: {layer.name}")





In [None]:
# Per-token class probabilities for the test split.
test_softmaxed_outputs = model.predict(test)



In [None]:
# First test sentence: padded gold label ids (999 = padding), then the predicted probabilities per timestep.
print((test_labels[0]))
print((test_softmaxed_outputs[0]))

[  1   1   4   1   1   1   1   3   1   1   1   1 999 999 999 999 999 999
 999 999 999 999 999 999 999 999 999 999 999 999 999 999]
[[6.77952170e-02 7.18678176e-01 5.77540472e-02 6.40887842e-02
  7.50453919e-02 4.02428024e-03 4.70536947e-03 7.90871773e-03]
 [1.18871830e-01 1.74318880e-01 1.16742827e-01 1.19406223e-01
  1.18349358e-01 1.17174484e-01 1.17637008e-01 1.17499359e-01]
 [7.25751817e-02 6.33530796e-01 9.14234146e-02 3.71440798e-02
  1.50167182e-01 3.44907516e-03 3.81101016e-03 7.89929740e-03]
 [1.32251373e-02 9.47483242e-01 1.19246030e-02 5.75576583e-03
  1.90102775e-02 5.18397137e-04 7.81384064e-04 1.30122853e-03]
 [1.57218892e-02 9.27361488e-01 1.91534255e-02 8.04603286e-03
  2.75467262e-02 5.21303911e-04 6.23351079e-04 1.02580211e-03]
 [1.67402029e-02 9.25362170e-01 1.94285288e-02 7.02356873e-03
  2.93709766e-02 5.30447694e-04 5.89191623e-04 9.54823219e-04]
 [1.18871830e-01 1.74318880e-01 1.16742827e-01 1.19406223e-01
  1.18349358e-01 1.17174484e-01 1.17637008e-01 1.17499359

Evaluation

In [None]:
!pip install seqeval



In [None]:
# Inverse of the tag dictionary (integer id -> tag string), used to print readable labels.
idx_to_label = dict(zip(tag_to_integer_dictionary.values(), tag_to_integer_dictionary.keys()))
# The most probable class per token.
predicted_indices = test_softmaxed_outputs.argmax(axis=-1)

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Assuming predicted_indices and test_labels are numpy arrays
predicted_indices = np.array(predicted_indices)
test_labels = np.array(test_labels)

# Flatten the arrays so that every token is one entry
predicted_flat = predicted_indices.flatten()
labels_flat = test_labels.flatten()

# Filter out the padding values (999)
mask = labels_flat != 999
filtered_predictions = predicted_flat[mask]
filtered_true_labels = labels_flat[mask]

# Token-level precision, recall and F1, weighted by the number of true tokens per class.
# zero_division=0 scores a class that is never predicted as 0 explicitly, instead of
# relying on the default, which gives the same value but emits an undefined-metric warning.
score_options = dict(average='weighted', zero_division=0)
precision = precision_score(filtered_true_labels, filtered_predictions, **score_options)
recall = recall_score(filtered_true_labels, filtered_predictions, **score_options)
f1 = f1_score(filtered_true_labels, filtered_predictions, **score_options)

print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')

Precision: 0.7757747382874197
Recall: 0.8582400702216371
F1 Score: 0.812918042114928


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Translate the integer ids back to tag strings.
filtered_label_predictions = [idx_to_label[index] for index in filtered_predictions]
filtered_label_actual = [idx_to_label[index] for index in filtered_true_labels]
# Show the first 50 tokens; slicing avoids an IndexError when fewer than 50 are available.
for predicted_label, actual_label in zip(filtered_label_predictions[:50], filtered_label_actual[:50]):
  print("pred: ", predicted_label, " actual: ", actual_label)


pred:  O  actual:  O
pred:  O  actual:  O
pred:  O  actual:  I-LOC
pred:  O  actual:  O
pred:  O  actual:  O
pred:  O  actual:  O
pred:  O  actual:  O
pred:  O  actual:  I-PER
pred:  O  actual:  O
pred:  O  actual:  O
pred:  O  actual:  O
pred:  O  actual:  O
pred:  I-PER  actual:  I-PER
pred:  I-PER  actual:  I-PER
pred:  O  actual:  I-LOC
pred:  O  actual:  O
pred:  I-PER  actual:  I-LOC
pred:  I-LOC  actual:  I-LOC
pred:  I-LOC  actual:  I-LOC
pred:  O  actual:  O
pred:  I-LOC  actual:  I-LOC
pred:  O  actual:  O
pred:  O  actual:  O
pred:  O  actual:  O
pred:  O  actual:  O
pred:  O  actual:  O
pred:  O  actual:  I-MISC
pred:  O  actual:  I-MISC
pred:  O  actual:  O
pred:  O  actual:  O
pred:  O  actual:  O
pred:  O  actual:  O
pred:  O  actual:  O
pred:  O  actual:  O
pred:  O  actual:  O
pred:  I-LOC  actual:  I-LOC
pred:  O  actual:  O
pred:  O  actual:  O
pred:  O  actual:  O
pred:  O  actual:  O
pred:  O  actual:  O
pred:  O  actual:  O
pred:  O  actual:  O
pred:  O  actual:  

In [None]:
# Issues encountered:
# 1. Long sequences (many timesteps) cannot be used, because NaN values appear during training.
# 2. The full dataset cannot be used, because there is not enough RAM.
# 3. There were problems visualising the code.

SyntaxError: ignored

#Q1.3(a) Discuss how you deal with new words in the training set which are not found in the pretrained dictionary. Likewise, how do you deal with new words in the test set which are not found in either the pretrained dictionary or the training set? Show the corresponding code snippet

In [None]:
# For all new words not found in the pre-trained dictionary, a numpy array of size 300 that contains zeros is used in place of the usual array of size 300 pre-trained values
"""
if word not in embeddings:
  new_embedding = np.zeros(300)
"""
# This method is simple but effective and widely used in NER tasks where there are missing embedding values

#Q1.3(b) Describe what neural network you used to produce the final vector representation of each word and what are the mathematical functions used for the forward computation (i.e., from the pretrained word vectors to the final label of each word). Give the detailed setting of the network including which parameters are being updated, what are their sizes, and what is the length of the final vector representation of each word to be fed to the softmax classifier.

The neural network transforms pre-trained word vectors into categorical labels through a bidirectional LSTM followed by a single dense layer.

Components:

Input Layer: Receives input of shape (None, maximum sentence length(words), word2vec embedding dimensions). Note the maximum sentence length is also the timesteps in this context.

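Masking Layer: Applies a mask to the input so that any timestep whose feature vector is entirely zero is skipped, preventing padding from affecting the subsequent layers' computations. Because out-of-vocabulary words are also encoded as all-zero vectors, those timesteps are masked as well.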

Bidirectional LSTM Layer: This layer utilizes two LSTM layers that process the data in both forward and reverse directions. Each LSTM layer comprises 16 units and implements the following mathematical operations:

Input Gate: i_t = σ(W_i*[h_{t-1}, x_t] + b_i)

Forget Gate: f_t = σ(W_f*[h_{t-1}, x_t] + b_f)

Output Gate: o_t = σ(W_o*[h_{t-1}, x_t] + b_o)

Cell State: c_t = f_t * c_{t-1} + i_t * tanh(W_c*[h_{t-1}, x_t] + b_c)

Hidden State: h_t = o_t * tanh(c_t)

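Where σ denotes the sigmoid function, tanh is the hyperbolic tangent activation function, W and b are the weights and biases of the respective gates, and [h_{t-1}, x_t] is the concatenation of the previous hidden state and the current input. W*[h_{t-1}, x_t] is a matrix–vector product, whereas * between two vectors (as in f_t * c_{t-1}) denotes element-wise multiplication. The bidirectional wrapper concatenates the outputs from both directions for each timestep, resulting in a 32-dimensional vector.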

Dense Output Layer: Applies a linear transformation followed by a softmax activation to the LSTM outputs to obtain the probability distribution over classes:

Linear Transformation: z = W_d * h + b_d
Softmax Activation: softmax(z)_i = exp(z_i) / Σ_j exp(z_j), for i = 1 to num_classes
Here, W_d and b_d are the weights and biases of the Dense layer, and h is the output from the Bidirectional LSTM.

Training Details:

Loss Function: Utilizes a custom masked_loss_function which computes the cross-entropy loss for unmasked timesteps while excluding the effects of timesteps with a label of 999.

Optimizer: Employs the Adam optimizer with a specified learning rate and gradient clipping to prevent the adverse effects of large gradient updates.

Training Process: The model is compiled and trained over a specified number of epochs with batch-based updates, where parameters are adjusted to minimize the custom loss function, with accuracy serving as the performance metric.

Final Vector Representation:

The final representation of each word is the 32-dimensional vector produced by the bidirectional LSTM (16 units per direction, concatenated). The dense layer maps this vector to a vector of length num_classes (8 here), and the softmax turns that vector into the probability of each class given the context of the word as understood by the bidirectional LSTM.

#Q1.3(c) Report how many epochs you used for training, as well as the running time.

epochs = 10