# Assignment 1

**Due to**: 11/01/2022 (dd/mm/yyyy)

If you deliver it by 11/12/2021 your assignment will be graded by 11/01/2022.


**Credits**: Andrea Galassi, Federico Ruggeri, Paolo Torroni

**Summary**: Part-of Speech (POS) tagging as Sequence Labelling using Recurrent Neural Architectures

# Execution
## 0. Utils

In [2]:
import absl.logging
absl.logging.set_verbosity(absl.logging.ERROR)

import re
import pandas as pd
import numpy as np
import os
import urllib.request
import zipfile
import progressbar

import nltk
import sklearn
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical

import tensorflow as tf
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, GRU, TimeDistributed
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import f1_score

import plotly.express as px
import plotly.graph_objects as go

nltk.download('treebank')
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

An embedding matrix is a matrix of numerical vectors that represents the words in a vocabulary as low-dimensional dense vectors. These vectors are often used to initialize the weights of a neural network model, such as a long short-term memory (LSTM) network or a Gated Recurrent Unit (GRU).

In [19]:
#Function to return one code encoding of tags
def one_hot_encoding(tag_sents, n_tags):
    tag_one_hot_sent = []
    for tag_sent in tag_sents:
        tags_one_hot = []
        for tag in tag_sent:
            tags_one_hot.append(np.zeros(n_tags))
            tags_one_hot[-1][tag] = 1.0
        tag_one_hot_sent.append(tags_one_hot)
    return np.array(tag_one_hot_sent)

## 1. Corpus
### 1.1 Pre-processing

From the original tags list we removed all the symbols and english punctuation plus:
- FW, Foreign Word, because there are no examples in the test set;
- UH, Interjection, because there are no examples in the test set;
- LS, List Item Marker, because there are no examples in the test set (and because it denotes symbols as well);

In [46]:
# Get the files' list
fileids = nltk.corpus.treebank.fileids()

# Get the Penn Treebank corpus and tokenize the text
train_corpus = nltk.corpus.treebank.tagged_sents(fileids[:100])
val_corpus = nltk.corpus.treebank.tagged_sents(fileids[100:150])
test_corpus = nltk.corpus.treebank.tagged_sents(fileids[150:])

# Flatten the lists
ignore = [':', '#', '"', '$', '-LRB-', '-RRB-', ',', '.', "''", '``', 'SYM', '-NONE-']

# train_corpus = [tuple(list(item)+[str(idx)]) for idx,sublist in enumerate(train_corpus) for item in sublist if item[1] not in ignore]
# val_corpus = [tuple(list(item)+[str(idx)]) for idx,sublist in enumerate(val_corpus) for item in sublist if item[1] not in ignore]
# test_corpus = [tuple(list(item)+[str(idx)]) for idx,sublist in enumerate(test_corpus) for item in sublist if item[1] not in ignore]

train_corpus = [tuple(list(item)+[str(idx)]) for idx,sublist in enumerate(train_corpus) for item in sublist if item[1] != '-NONE-']
val_corpus = [tuple(list(item)+[str(idx)]) for idx,sublist in enumerate(val_corpus) for item in sublist if item[1] != '-NONE-']
test_corpus = [tuple(list(item)+[str(idx)]) for idx,sublist in enumerate(test_corpus) for item in sublist if item[1] != '-NONE-']

In [47]:
train_df = pd.DataFrame(train_corpus, columns = ['word', 'tag', 'sentence'])
# train_df['word'] = pre_process(train_df,'word')

print(train_df.shape)
train_df.describe()

(47356, 3)


Unnamed: 0,word,tag,sentence
count,47356,47356,47356
unique,8009,45,1963
top,",",NN,1854
freq,2570,6270,249


In [48]:
val_df = pd.DataFrame(val_corpus, columns = ['word', 'tag', 'sentence'])
# val_df['word'] = pre_process(val_df,'word')

print(val_df.shape)
val_df.describe()

(31183, 3)


Unnamed: 0,word,tag,sentence
count,31183,31183,31183
unique,5892,44,1299
top,",",NN,339
freq,1528,4513,81


In [49]:
test_df = pd.DataFrame(test_corpus, columns = ['word', 'tag', 'sentence'])
# test_df['word'] = pre_process(test_df,'word')

print(test_df.shape)
test_df.describe()

(15545, 3)


Unnamed: 0,word,tag,sentence
count,15545,15545,15545
unique,3623,40,652
top,",",NN,232
freq,787,2383,58


In [50]:
tags_train = sorted(list(set([x for x in train_df.tag])))
tags_val = sorted(list(set([x for x in val_df.tag])))
tags_test = sorted(list(set([x for x in test_df.tag])))

print('Train tags number:',len(tags_train))
print('Val tags number:',len(tags_val))
print('Test tags number:',len(tags_test))

if len(tags_test) != len(tags_val) or len(tags_test) != len(tags_train):
  print('\nMismatching numbers.')
  print('Removing extra classes:')

  missing_classes_train = [x for x in tags_train if x not in tags_test]
  missing_classes_val   = [x for x in tags_val if x not in tags_test]

  missing_classes = list(set(missing_classes_train + missing_classes_val))
  print(missing_classes)

  for cl in missing_classes:
    train_df = train_df[train_df.tag != cl]
    val_df = val_df[val_df.tag != cl]

  tags_train = sorted(list(set([x for x in train_df.tag])))
  tags_val = sorted(list(set([x for x in val_df.tag])))
  tags_test = sorted(list(set([x for x in test_df.tag])))

  print('\nNew Train tags number:',len(tags_train))
  print('New Val tags number:',len(tags_val))
  print('New Test tags number:',len(tags_test))

print('\nTags:')
for tag in tags_train:
  print(f'-{tag}')

Train tags number: 45
Val tags number: 44
Test tags number: 40

Mismatching numbers.
Removing extra classes:
['SYM', 'FW', 'LS', '#', 'UH']

New Train tags number: 40
New Val tags number: 40
New Test tags number: 40

Tags:
-$
-''
-,
--LRB-
--RRB-
-.
-:
-CC
-CD
-DT
-EX
-IN
-JJ
-JJR
-JJS
-MD
-NN
-NNP
-NNPS
-NNS
-PDT
-POS
-PRP
-PRP$
-RB
-RBR
-RBS
-RP
-TO
-VB
-VBD
-VBG
-VBN
-VBP
-VBZ
-WDT
-WP
-WP$
-WRB
-``


In [51]:
X_train_raw = train_df.groupby('sentence').word.apply(list).reset_index()['word']
y_train_raw = train_df.groupby('sentence').tag.apply(list).reset_index()['tag']

X_val_raw = val_df.groupby('sentence').word.apply(list).reset_index()['word']
y_val_raw = val_df.groupby('sentence').tag.apply(list).reset_index()['tag']

X_test_raw = test_df.groupby('sentence').word.apply(list).reset_index()['word']
y_test_raw = test_df.groupby('sentence').tag.apply(list).reset_index()['tag']

X = [*X_train_raw, *X_val_raw, *X_test_raw]
y = [*y_train_raw, *y_val_raw, *y_test_raw]

# creating sets of words 
words = set([word.lower() for sentence in X for word in sentence])

# bulding vocabulary of words and tags 
word2index = {word: i + 2 for i, word in enumerate(list(words))}
word2index['-PAD-'] = 0  # 0 is assigned for padding
word2index['-OOV-'] = 1  # 1 is assigned for unknown words
tag2index = {tag: i + 1 for i, tag in enumerate(tags_train)}
tag2index['-PAD-'] = 0  # 0 is assigned for padding

vocab_size = len(word2index)

In [52]:
X_train, X_val, X_test, y_train, y_val, y_test = [], [], [], [], [], []

# encode X
for sentence in X_train_raw:
    sent_int = []
    for word in sentence:
        try:
            sent_int.append(word2index[word.lower()])
        except KeyError:
            sent_int.append(word2index['-OOV-'])
    X_train.append(sent_int)

for sentence in X_val_raw:
    sent_int = []
    for word in sentence:
        try:
            sent_int.append(word2index[word.lower()])
        except KeyError:
            sent_int.append(word2index['-OOV-'])
    X_val.append(sent_int)

for sentence in X_test_raw:
    sent_int = []
    for word in sentence:
        try:
            sent_int.append(word2index[word.lower()])
        except KeyError:
            sent_int.append(word2index['-OOV-'])
    X_test.append(sent_int)

# encode Y
for sent_tags in y_train_raw:
    y_train.append([tag2index[tag] for tag in sent_tags])

for sent_tags in y_val_raw:
    y_val.append([tag2index[tag] for tag in sent_tags])

for sent_tags in y_test_raw:
    y_test.append([tag2index[tag] for tag in sent_tags])

In [53]:
print('-Not encoded')
print('\t',X_train_raw[0]) 
print('\t',y_train_raw[0])
print('-Encoded')
print('\t',X_train[0])
print('\t',y_train[0])

-Not encoded
	 ['Pierre', 'Vinken', ',', '61', 'years', 'old', ',', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'Nov.', '29', '.']
	 ['NNP', 'NNP', ',', 'CD', 'NNS', 'JJ', ',', 'MD', 'VB', 'DT', 'NN', 'IN', 'DT', 'JJ', 'NN', 'NNP', 'CD', '.']
-Encoded
	 [869, 9821, 3723, 58, 9698, 7251, 3723, 2108, 10184, 6408, 2959, 10527, 10918, 7706, 10469, 9996, 3998, 596]
	 [18, 18, 3, 9, 20, 13, 3, 16, 30, 10, 17, 12, 10, 13, 17, 18, 9, 6]


In [54]:
# check length of longest sentence
lengths = [len(seq) for seq in X_train+X_test+X_val]
print("Length of longest sentence: {}".format(max(lengths)))
print("Average length: {}".format(sum(lengths)/len(lengths)))

max_len = max(lengths)
X_train = pad_sequences(X_train,padding='post',maxlen=max_len)
X_val = pad_sequences(X_val,padding='post',maxlen=max_len)
X_test = pad_sequences(X_test,padding='post',maxlen=max_len)

y_train = pad_sequences(y_train,padding='post',maxlen=max_len)
y_val = pad_sequences(y_val,padding='post',maxlen=max_len)
y_test = pad_sequences(y_test,padding='post',maxlen=max_len)

print('-Padded')
print('\tX:',X_train[0])
print('\n\ty:',y_train[0])

# Create a sample weight vector where the weights for the padded samples 
# are set to 0 and the weights for the non-padded samples are set to 1
sample_weight = np.ones(X_train.shape)
for i in range(X_train.shape[0]):
    for j in range(X_train.shape[1]):
        if X_train[i][j] == 0 or y_train_raw[i][j] in ignore:
            sample_weight[i][j] = 0

Length of longest sentence: 249
Average length: 24.028359734287175
-Padded
	X: [  869  9821  3723    58  9698  7251  3723  2108 10184  6408  2959 10527
 10918  7706 10469  9996  3998   596     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0   

In [55]:
y_train_one_hot = to_categorical(y_train)
y_val_one_hot = to_categorical(y_val)
y_test_one_hot = to_categorical(y_test)

# y_train_one_hot = one_hot_encoding(y_train, len(tag2index))
# y_val_one_hot = one_hot_encoding(y_val, len(tag2index))
# y_test_one_hot = one_hot_encoding(y_test, len(tag2index))

## 2. GloVe 
GloVe (Global Vectors for Word Representation) is a method for learning vector representations of words, called "word embeddings," from a large corpus of text. Word embeddings are numerical representations of words that capture the semantic relationships between words in a continuous, low-dimensional space. They are commonly used as input to natural language processing models, such as language translation and language modeling.

GloVe works by learning the co-occurrence statistics of words in a corpus, and using this information to learn word embeddings that capture the semantic relationships between words. The GloVe method produces word embeddings that are trained on a global corpus, as opposed to embeddings that are trained on a specific task or dataset.

There are different versions of the GloVe word embeddings, including 50-dimensional, 100-dimensional, and 200-dimensional embeddings. The 50-dimensional version of GloVe embeddings may be better in some applications because they have a lower dimensionality, which can make them easier to work with and more computationally efficient.

By using GloVe embeddings as the initial weights for a model, we can take advantage of these pre-trained word representations and fine-tune them for a specific task.

In [39]:
pbar = None
def show_progress(block_num, block_size, total_size):
    global pbar
    if pbar is None:
        pbar = progressbar.ProgressBar(maxval=total_size)
        pbar.start()

    downloaded = block_num * block_size
    if downloaded < total_size:
        pbar.update(downloaded)
    else:
        pbar.finish()
        pbar = None

# Download the GloVe embeddings file
url = 'http://nlp.stanford.edu/data/glove.6B.zip'
urllib.request.urlretrieve(url, 'glove.6B.zip', show_progress)

# Extract the zip file
zip_ref = zipfile.ZipFile('glove.6B.zip', 'r')
zip_ref.extractall()
zip_ref.close()

100% (862182613 of 862182613) |##########| Elapsed Time: 0:02:39 Time:  0:02:39


In [56]:
# Load the GloVe embeddings into a dictionary
embedding_dict = {}
embedding_dim = 300
with open(f'glove.6B.{embedding_dim}d.txt', 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embedding_dict[word] = coefs

# Print the number of words in the embeddings dictionary
print(f'Found {len(embedding_dict)} word vectors.')

Found 400000 word vectors.


In [58]:
#Building the Embedding matrix 
# embedding_matrix = np.zeros((len(word2index), embedding_dim))
embedding_matrix = np.random.normal(0, 1, (len(word2index), embedding_dim))
for word, i in word2index.items():
    embedding_vector = embedding_dict.get(word)
    if i < len(word2index):
        if embedding_vector is not None:
            # Words not found in embedding index will be all-zeros.
            embedding_matrix[i] = embedding_vector

## 3. Model
### 3.1 Baseline (MACRO f1 0.952)
Bidirectional LSTM layers are able to process sequential data in both the forward and backward directions, which can allow the model to capture contextual information from both the past and the future. This can be particularly useful for natural language processing tasks, where the meaning of a word can depend on the context in which it is used.

In the context of POS tagging, TimeDistributed can be used to apply a tag prediction layer to each word in a sentence. For example, you might have an RNN that processes a sequence of words in a sentence, and at each time step, the RNN outputs a hidden state. You could then apply a TimeDistributed dense layer to the hidden states, which would allow you to predict the POS tag for each word in the sentence.

One advantage of using TimeDistributed for POS tagging is that it allows you to predict the POS tag for each word in the sentence simultaneously, rather than having to process the sentence one word at a time. This can be particularly useful when dealing with long sentences, as it can make the tagging process more efficient.

Overall, using TimeDistributed for POS tagging can help you build more accurate and efficient models for natural language processing tasks that involve sequential data.

In [61]:
# Define the model
baseline_model = tf.keras.Sequential(name='Baseline')

# Add the Embedding layer
baseline_model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, \
                    weights = [embedding_matrix], input_length = max_len, trainable=True))

# Add the Bidirectional LSTM layer
baseline_model.add(Bidirectional(LSTM(units=128, return_sequences=True)))

# Add the Dense/Fully-Connected layer
baseline_model.add(TimeDistributed(Dense(units=len(tags_train)+1, activation='softmax')))

# Compile the model
baseline_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'], sample_weight_mode='temporal')

# Summary
baseline_model.summary()

Model: "Baseline"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 249, 300)          3282900   
                                                                 
 bidirectional_4 (Bidirectio  (None, 249, 256)         439296    
 nal)                                                            
                                                                 
 time_distributed_4 (TimeDis  (None, 249, 41)          10537     
 tributed)                                                       
                                                                 
Total params: 3,732,733
Trainable params: 3,732,733
Non-trainable params: 0
_________________________________________________________________


In [62]:
results_baseline = baseline_model.fit(X_train, y_train_one_hot, epochs=10, verbose = True,\
                                      validation_data=(X_val,y_val_one_hot), batch_size=128, sample_weight=sample_weight)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [63]:
y_pred = baseline_model.predict(X_test)



In [80]:
print(y_pred.shape)
print(y_test_one_hot.shape)

index2tag = {v: k for k, v in tag2index.items()}


# Flatten the predictions and the true labels to 1D arrays
# predictions = y_pred.flatten()
# y_true = y_test_one_hot.flatten()

th = 1
# predictions[predictions >= th] = 1 
# predictions[predictions  < th] = 0

y_pred[:][:][y_pred >= th] = 1
y_pred[:][:][y_pred < th] = 0
print(y_pred[1][2])
print(y_test_one_hot[1][2])

# Compute the F1 score
# f1_baseline = f1_score(y_true, predictions, average = 'macro')

# print("Macro-F1 score:", round(f1_baseline,3))

# baseline_model.save('./baseline_model')

(652, 249, 41)
(652, 249, 41)
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]


### 3.2 GRU (MACRO f1 0.958)
Gated Recurrent Units (GRUs) are a type of recurrent neural network (RNN) that are often used in natural language processing tasks such as part-of-speech (POS) tagging. GRUs are similar to long short-term memory (LSTM) networks, but they have a simpler structure and fewer parameters, making them easier to train and faster to run. In POS tagging, GRUs can be used to process a sequence of words and predict the POS tags for each word in the sequence. GRUs are able to take into account contextual information from the previous words in the sequence, allowing them to make more accurate predictions about the POS tags for the current word. 

Both BiLSTMs (Bidirectional LSTMs) and Gated Recurrent Units (GRUs) have been shown to perform well on a variety of NLP tasks, including POS tagging, but here we obtained slightly better results than with the baseline; the reason may be that LSTMs are are particularly well-suited for tasks that require the model to remember and make use of long-term dependencies in the data, while the longest sentence in the Penn Treebank dataset has only 171 words and the average of words per sentence is around 20.  

In [None]:
# Define the model
gru_model = tf.keras.Sequential(name='GRU')

# Add the Embedding layer
gru_model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, \
                    weights = [embedding_matrix], input_length = max_len, trainable=True))

# Add the GRU layer
gru_model.add(GRU(units=128, return_sequences=True))

# Add the Dense/Fully-Connected layer
gru_model.add(TimeDistributed(Dense(len(tags_train)+1, activation='softmax')))

# Compile the model
gru_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Summary
gru_model.summary()

Model: "GRU"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 171, 300)          2700600   
                                                                 
 gru (GRU)                   (None, 171, 128)          165120    
                                                                 
 time_distributed_1 (TimeDis  (None, 171, 33)          4257      
 tributed)                                                       
                                                                 
Total params: 2,869,977
Trainable params: 2,869,977
Non-trainable params: 0
_________________________________________________________________


In [None]:
results_gru = gru_model.fit(X_train, y_train, epochs=10, verbose = True, validation_data=(X_val,y_val), batch_size=128)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
y_pred = gru_model.predict(X_test)



In [None]:
# Flatten the predictions and the true labels to 1D arrays
predictions = y_pred.flatten()
y_true = y_test.flatten()

th = 0.1
predictions[predictions >= th] = 1 
predictions[predictions  < th] = 0

# Compute the F1 score
f1_gru = f1_score(y_true, predictions, average = 'macro')

print("Macro-F1 score:", round(f1_gru,3))

gru_model.save('./gru_model')

Macro-F1 score: 0.958


### 3.3 Additional LSTM layer (MACRO f1 0.967) 
Using two BiLSTMs layers can allow the model to learn more complex patterns in the data and make more accurate predictions. 
However, they can increase the computational complexity of our model, which may require more computational resources to train.

With the same number of epochs the results were similar to the baseline and the training process was slower; it is possible that the model with two BiLSTMs is more prone to overfitting, meaning that it is able to fit the training data very well but is less able to generalize to new data. Another possibility is that the model with two BiLSTMs simply takes longer to train. That is why we raised the training epochs to 15, obtaining better results.

In [None]:
# Define the model
add_lstm_model = tf.keras.Sequential(name='Additional_LSTM')

# Add the Embedding layer
add_lstm_model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, \
                    weights = [embedding_matrix], input_length = max_len, trainable=True))

# Add the Bidirectional LSTM layer
add_lstm_model.add(Bidirectional(LSTM(units=128, return_sequences=True)))

# Add another LSTM layer
add_lstm_model.add(Bidirectional(LSTM(units=128, return_sequences=True)))

# Add the Dense/Fully-Connected layer
add_lstm_model.add(TimeDistributed(Dense(units=len(tags_train)+1, activation='softmax')))

# Compile the model
add_lstm_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Summary
add_lstm_model.summary()

Model: "Additional_LSTM"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 171, 300)          2700600   
                                                                 
 bidirectional_1 (Bidirectio  (None, 171, 256)         439296    
 nal)                                                            
                                                                 
 bidirectional_2 (Bidirectio  (None, 171, 256)         394240    
 nal)                                                            
                                                                 
 time_distributed_2 (TimeDis  (None, 171, 33)          8481      
 tributed)                                                       
                                                                 
Total params: 3,542,617
Trainable params: 3,542,617
Non-trainable params: 0
_________________________________________

In [None]:
results_add_lstm = add_lstm_model.fit(X_train, y_train, epochs=15, verbose = True, validation_data=(X_val,y_val), batch_size=128)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [None]:
y_pred = add_lstm_model.predict(X_test)



In [None]:
# Flatten the predictions and the true labels to 1D arrays
predictions = y_pred.flatten()
y_true = y_test.flatten()

th = 0.1
predictions[predictions >= th] = 1 
predictions[predictions  < th] = 0

# Compute the F1 score
f1_add_lstm = f1_score(y_true, predictions, average = 'macro')

print("Macro-F1 score:", round(f1_add_lstm,3))

add_lstm_model.save('./add_lstm_model')

Macro-F1 score: 0.966


### 3.4 Additional dense layer (MACRO f1 0.969)

Using two dense layers, one with a non-linear activation function and one with a softmax activation function, is a common pattern in neural network architectures for classification tasks.

The purpose of the non-linear dense layer is to introduce non-linearity into the model, which can allow the model to learn more complex patterns in the data. Common choices for the activation function in this layer include ReLU (Rectified Linear Unit), sigmoid, and tanh.

The purpose of the softmax dense layer is to produce a probability distribution over the possible classes. The softmax activation function transforms the output of the preceding layer into a probability distribution, where the sum of the probabilities is equal to 1. This is useful for classification tasks, where you want to predict the probability that an input belongs to each of the possible classes. Using two dense layers in this way can allow the model to learn more complex patterns in the data and make more accurate predictions.

We have increased the number of training epochs to 15 for the same reasons as before.

In [None]:
# Define the model
add_fc_model = tf.keras.Sequential(name='Additional_FC')

# Add the Embedding layer
add_fc_model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, \
                    weights = [embedding_matrix], input_length = max_len, trainable=True))

# Add the Bidirectional LSTM layer
add_fc_model.add(Bidirectional(LSTM(units=128, return_sequences=True)))

# Add another Dense layer
add_fc_model.add(TimeDistributed(Dense(units=max_len, activation='relu')))

# Add the Dense/Fully-Connected layer
add_fc_model.add(TimeDistributed(Dense(units=len(tags_train)+1, activation='softmax')))

# Compile the model
add_fc_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Summary
add_fc_model.summary()

Model: "Additional_FC"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 171, 300)          2700600   
                                                                 
 bidirectional_3 (Bidirectio  (None, 171, 256)         439296    
 nal)                                                            
                                                                 
 time_distributed_3 (TimeDis  (None, 171, 171)         43947     
 tributed)                                                       
                                                                 
 time_distributed_4 (TimeDis  (None, 171, 33)          5676      
 tributed)                                                       
                                                                 
Total params: 3,189,519
Trainable params: 3,189,519
Non-trainable params: 0
___________________________________________

A smaller value for patience means that the model training will be stopped more quickly if the metric is not improving, which can be useful for avoiding overfitting or for reducing the training time.

In [None]:
results_add_fc = add_fc_model.fit(X_train, y_train, epochs=15, verbose = True, validation_data=(X_val,y_val), batch_size=128)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [None]:
y_pred = add_fc_model.predict(X_test)



In [None]:
# Flatten the predictions and the true labels to 1D arrays
predictions = y_pred.flatten()
y_true = y_test.flatten()

th = 0.1
predictions[predictions >= th] = 1 
predictions[predictions  < th] = 0

# Compute the F1 score
f1_add_fc = f1_score(y_true, predictions, average = 'macro')

print("Macro-F1 score:", round(f1_add_fc,3))

add_fc_model.save('./add_fc_model')

Macro-F1 score: 0.969


## 4. Comparison


In [None]:
# # Create a list of epochs (i.e., the x-axis data)
epochs = list(range(1, len(results_baseline.history['val_accuracy'])+1))

# # Create a Plotly line plot using the epochs and validation accuracy data
fig = go.Figure()
fig.add_trace(go.Scatter(x=epochs, y=results_baseline.history['val_accuracy'], name='Baseline - BiLSTM Model', mode='lines+markers'))
fig.add_trace(go.Scatter(x=epochs, y=results_gru.history['val_accuracy'], name='GRU Model', mode='lines+markers'))
fig.show()

# Create a list of epochs (i.e., the x-axis data)
epochs = list(range(1, len(results_add_lstm.history['val_accuracy'])+1))

fig2 = go.Figure()
fig2.add_trace(go.Scatter(x=epochs, y=results_add_lstm.history['val_accuracy'], name='2 BiLSTMs Model', mode='lines+markers'))
fig2.add_trace(go.Scatter(x=epochs, y=results_add_fc.history['val_accuracy'], name='2 FCs Model', mode='lines+markers'))
fig2.show()

In [None]:
max_width = max(len(str(f1_baseline)), len(str(f1_gru)), len(str(f1_add_lstm)), len(str(f1_add_fc)))

header_row = f'| F1 Score Baseline {" " * (max_width - len("F1 Score Baseline"))} | F1 Score GRU {" " * (max_width - len("F1 Score GRU"))} |\
 F1 Score Add. BiLSTM {" " * (max_width - len("F1 Score Add. BiLSTM"))} | F1 Score Add. Dense {" " * (max_width - len("F1 Score Add. Dense"))} |'
separator_row = '-' * len(header_row)
data_row = f'| {f1_baseline:<{max_width}} | {f1_gru:<{max_width}} | {f1_add_lstm:<{max_width}} | {f1_add_fc:<{max_width}} |'

print(header_row)
print(separator_row)
print(data_row)

| F1 Score Baseline   | F1 Score GRU        | F1 Score Add. BiLSTM  | F1 Score Add. Dense  |
--------------------------------------------------------------------------------------------
| 0.9524195137886767 | 0.9581359536923748 | 0.966182171087751  | 0.9687673190990866 |
