https://github.com/pranavphoenix/BiLSTM-POS-Tagging/blob/main/BiLSTM_POS_Tagging.ipynb

https://linguistics.stackexchange.com/questions/16897/unable-to-understand-meaning-of-tag-none-1-in-penn-treebank-example

TODO:
- Cacasburo
- Guardare creazione dizionario, bisogna rispettare i punti dell'assignment;
- Non togliere punctuation e symbols ma evitare di utilizzarli nel calcolo delle metriche, magari utilizzando l'array di pesi 'sample_weight' che si trova nell'altro notebook;
- Provare se i risultati migliorano con preprocessing (e.g. lowerando le parole);
- Aggiustare il notebook perché fa cagare;

# Assignment 1

**Due to**: 11/01/2022 (dd/mm/yyyy)

If you deliver it by 11/12/2021 your assignment will be graded by 11/01/2022.


**Credits**: Andrea Galassi, Federico Ruggeri, Paolo Torroni

**Summary**: Part-of Speech (POS) tagging as Sequence Labelling using Recurrent Neural Architectures

# Execution
## 0.1 Imports

In [6]:
import nltk
import numpy as np
import os
import re
import random
import pandas as pd
import tensorflow as tf
from collections import defaultdict, OrderedDict

import keras
from keras import backend as K
from keras.layers import Dense, LSTM, InputLayer, Bidirectional, TimeDistributed, GRU, Embedding, Activation
from keras.models import Sequential
from keras.optimizers import Adam
from keras_preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from sklearn.metrics import f1_score

import urllib.request
import zipfile
import progressbar
from IPython.display import display_html
from itertools import chain,cycle

import plotly.graph_objs as go
import plotly.express as px

## 0.2 Functions

In [51]:
# Downloading Glove Word Embeddings
pbar = None
def show_progress(block_num, block_size, total_size):
    global pbar
    if pbar is None:
        pbar = progressbar.ProgressBar(maxval=total_size)
        pbar.start()

    downloaded = block_num * block_size
    if downloaded < total_size:
        pbar.update(downloaded)
    else:
        pbar.finish()
        pbar = None

# -Preprocessing - anlaysis-
# Display dataframes
def display(*args,titles=cycle([''])):
    html_str=''
    for df,title in zip(args, chain(titles,cycle(['</br>'])) ):
        html_str+='<th style="text-align:left"><td style="vertical-align:top">'
        html_str+=f'<h4 style="text-align: left;">{title}</h2>'
        html_str+=df.to_html().replace('table','table style="display:inline"')
        html_str+='</td></th>'
    display_html(html_str,raw=True)

# Show mismatches of classes in different sets
def tags_mismatch(tags1,tags2,tags3,name1,name2,name3):

  print(f'{name1} tags number: {len(tags1)}')
  print(f'{name1} tags list: {tags1}')

  exceeding_validation = [el for el in tags1 if el not in tags2]
  if exceeding_validation != []:
    print(f'\tClasses in {name1} set for which there are no samples in {name2} set: {exceeding_validation}')

  exceeding_test = [el for el in tags1 if el not in tags3]

  if exceeding_test != []:
    print(f'\tClasses in {name1} set for which there are no samples in {name3} set: {exceeding_test}\n')


# Histograms of occurencies of words by tag
def plot_value_counts(df, key, name):
    values = df[key].value_counts()
    fig = px.bar(x=values.index, y=values.values)
    fig.update_layout(xaxis_title=key,
                      yaxis_title='Occurencies of words',
                      title=f'{name} set words per tag')
    fig.show()

# Plot tag distribution per sentence   
def plot_tag_distribution(tag_lists,name):
    tag_counts = []
    for tags in tag_lists:
        tag_dict = {}
        for tag in tags:
            if tag in tag_dict:
                tag_dict[tag] += 1
            else:
                tag_dict[tag] = 1
        tag_counts.append(tag_dict)
    
    df = pd.DataFrame(tag_counts)
    df = df.fillna(0)
    df = df.apply(lambda x: x / sum(x) * 100)
    
    fig = px.line(df, title=f'Tag Distribution per {name} Sentence')
    fig.show()
    
# -Vocabulary-
# Compute embeddings based on the respective tag means.
def mean_embed4tag(df, tags, embedding_dict, embedding_dim):
  tag_dict = {tag:np.zeros(embedding_dim) for tag in tags}
  tag_count = {tag:0 for tag in tags}
  count = 0

  for idx, row in df.iterrows():
    for tag in tags:
      if tag == row['tag']:
        if row['word'] in embedding_dict:
            tag_count[tag] += 1
            tag_dict[tag] += embedding_dict[row['word']]

  for tag in tags:
    if np.all(tag_dict[tag]):
      tag_dict[tag] = tag_dict[tag] / tag_count[tag]
  print(f'Computed mean embeddings for {len(tags)} tags.')
  return tag_dict
   
#Update vocabulary
def update_vocab(df,embeddings_index,tag_dict,embedding_dim,seed=42): 
  oov_c = 0 
  cap_oov = 0

  for idx, row in df.iterrows():
    if row['word'] not in embeddings_index:
      if row['word'].lower() not in embeddings_index:
        oov_c += 1
        noise = np.random.normal(0, 0.0001, size=embedding_dim)
        embeddings_index[row['word']] = tag_dict[row['tag']] + noise       
      else:
        cap_oov += 1
        
        embeddings_index[row['word']] = embeddings_index[row['word'].lower()]
  print(f'Added {oov_c} OOV words + respective embeddings to the vocabulary.')
  print(f'Added {cap_oov} Capitalized words + respective embeddings to the vocabulary.')
  return embeddings_index

# -Metrics-
# Custom metrics
def ignore_class_accuracy(classes=[0]):
    def ignore_accuracy(y_true, y_pred):
        y_true_class = K.argmax(y_true, axis=-1)
        y_pred_class = K.argmax(y_pred, axis=-1)
        
        matches = K.cast(K.equal(y_true_class, y_pred_class), 'int32')
        for to_ignore in classes:
          ignore_mask = K.cast(K.not_equal(y_pred_class, to_ignore), 'int32')
          matches = matches * ignore_mask
        accuracy = K.sum(matches) / K.maximum(K.sum(ignore_mask), 1)
        return accuracy
    return ignore_accuracy

def ignore_class_precision(classes=[0]):
    def ignore_precision(y_true, y_pred):
        y_true = K.round(K.clip(y_true, 0, 1))
        y_pred = K.round(K.clip(y_pred, 0, 1))
        
        true_positives = K.sum(y_true * y_pred)
        predicted_positives = K.sum(y_pred)
        precision = true_positives / (predicted_positives + K.epsilon())
        
        for to_ignore in classes:
            ignore_mask = K.not_equal(K.argmax(y_true, axis=-1), to_ignore)
            precision *= K.cast(ignore_mask, 'float32')
        
        return precision
    return ignore_precision

def ignore_class_recall(classes=[0]):
    def ignore_recall(y_true, y_pred):
        y_true = K.round(K.clip(y_true, 0, 1))
        y_pred = K.round(K.clip(y_pred, 0, 1))
        
        true_positives = K.sum(y_true * y_pred)
        possible_positives = K.sum(y_true)
        recall = true_positives / (possible_positives + K.epsilon())
        
        for to_ignore in classes:
            ignore_mask = K.not_equal(K.argmax(y_true, axis=-1), to_ignore)
            recall *= K.cast(ignore_mask, 'float32')
        
        return recall
    return ignore_recall
    
# Compute the mean of the metrics
def mean_metrics(models):
  ignore_values = {
    'macro_f1': [r['macro_f1'] for r in models],
    'ignore_precision': [r['scores']['ignore_precision'] for r in models],
    'ignore_recall': [r['scores']['ignore_recall'] for r in models]
  }

  mean_ignore_values = {k: np.mean(v) for k, v in ignore_values.items()}

  print("Mean values:", mean_ignore_values)

  return mean_ignore_values

# Setting the seeds
def set_reproducibility(seed):
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)
    os.environ['TF_DETERMINISTIC_OPS'] = '1'

In [3]:
# Main function to run models/evaluations/predictions/scores.
def run_models(name,layer_params,embedding_params,training_params,metrics,LR,seeds):
  model_recaps = []

  for seed in seeds:
      print(f'Running with seed: {seed}')
      set_reproducibility(seed)

      # Define the model
      model = Sequential(name=f'{name}_{seed}')

      # Add the Embedding layer
      model.add(Embedding(**embedding_params, trainable=False))

      # Add layers
      for layer_param in layer_params:
          layer_type = layer_param['layer_type']
          layer_kwargs = layer_param['layer_kwargs']
          if layer_type == "Bidirectional":
                layer = Bidirectional(LSTM(**layer_kwargs,return_sequences=True))
          elif layer_type == "Dense":
                layer = TimeDistributed(Dense(**layer_kwargs))
          elif layer_type == "GRU":
                layer = GRU(**layer_kwargs,return_sequences=True)
          model.add(layer)

      # Compile the model
      model.compile(optimizer=Adam(LR), loss='categorical_crossentropy', metrics=metrics)

      # Summary
      model.summary()
      
      # Fitting the model
      print(f'\nFitting the {name} model...')
      history = model.fit(**training_params)
      
      # Obtain the predictions made by the model on the validation set
      print(f'Evaluating the {name} model...')
      scores = model.evaluate(X_val, y_val_one_hot, return_dict = True)

      print(f'Obtaining predictions from the {name} model...')
      predictions_one_hot_encode = model.predict(X_val)

      # Convert the class probabilities into class labels
      predictions = np.argmax(predictions_one_hot_encode, axis=-1)

      # Create a binary mask for the classes to exclude
      mask = np.logical_not(np.isin(y_val, [tag2index[tag] for tag in ignore]))

      # Use the mask to exclude the classes that are in the list from the true positive, false positive, and false negative counts
      macro_f1= f1_score(y_val[mask], predictions[mask], average='macro')

      model_recap = {
          "model": model,
          "history": history,
          "scores": scores,
          "predictions": predictions,
          "macro_f1": macro_f1
      }

      model_recaps.append(model_recap)

      print(f'\nMacro f1 score: {macro_f1}\n')

  return model_recaps

In [4]:
# Downloading the dataset
nltk.download('treebank')

# Download the GloVe embeddings file
url = 'http://nlp.stanford.edu/data/glove.6B.zip'
urllib.request.urlretrieve(url, 'glove.6B.zip', show_progress)

# Extract the zip file
zip_ref = zipfile.ZipFile('glove.6B.zip', 'r')
zip_ref.extractall()
zip_ref.close()

[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.
100% (862182613 of 862182613) |##########| Elapsed Time: 0:02:38 Time:  0:02:38


## 1. Corpus
### 1.1 Pre-processing

The `-NONE-` tag in the Natural Language Toolkit (NLTK) is used to represent words or tokens that do not have a specific Part-of-Speech (POS) tag. Removing these occurances from the data can be useful for a POS-tagging task as it reduces the noise in the data and improves the quality of the results. By removing the `-NONE-` tags, the model will have fewer examples of unstructured data to learn from and can instead focus on the examples that are more relevant to the task of POS-tagging. This can help the model learn more accurate patterns and relationships between words and their corresponding POS tags, leading to more accurate results in the end.

In [5]:
# Get the files' list
fileids = nltk.corpus.treebank.fileids()

# Get the Penn Treebank tagged sentences
train_corpus = nltk.corpus.treebank.tagged_sents(fileids[:100])
val_corpus = nltk.corpus.treebank.tagged_sents(fileids[100:150])
test_corpus = nltk.corpus.treebank.tagged_sents(fileids[150:])

# Flatten the lists
train_corpus = [tuple(list(item)+[str(idx)]) for idx,sublist in enumerate(train_corpus) for item in sublist if item[1] != '-NONE-']
val_corpus = [tuple(list(item)+[str(idx)]) for idx,sublist in enumerate(val_corpus) for item in sublist if item[1] != '-NONE-']
test_corpus = [tuple(list(item)+[str(idx)]) for idx,sublist in enumerate(test_corpus) for item in sublist if item[1] != '-NONE-']

# Create the Dataframes
train_df = pd.DataFrame(train_corpus, columns = ['word', 'tag', 'sentence'])
val_df = pd.DataFrame(val_corpus, columns = ['word', 'tag', 'sentence'])
test_df = pd.DataFrame(test_corpus, columns = ['word', 'tag', 'sentence'])

# Summary of the created Dataframes
display(train_df.describe(), val_df.describe(), test_df.describe(), titles = [f'Training set {train_df.shape}', f'Validation set {val_df.shape}', f'Test set {test_df.shape}'])

Unnamed: 0,word,tag,sentence
count,47356,47356,47356
unique,8009,45,1963
top,",",NN,1854
freq,2570,6270,249

Unnamed: 0,word,tag,sentence
count,31183,31183,31183
unique,5892,44,1299
top,",",NN,339
freq,1528,4513,81

Unnamed: 0,word,tag,sentence
count,15545,15545,15545
unique,3623,40,652
top,",",NN,232
freq,787,2383,58


The number of words and in particular unique words in each set is different, with the training set having the most and the test set having the least.

The most frequent word in each set is `,` and the most frequent tag is `NN` (noun, singular or mass). This suggests that the datasets might have a large number of common words and that nouns might be the most frequent part of speech in the text, apart from the comma that will be ignored in the final scores computation.

In [11]:
# Ordering tags in the sets
tags_train = sorted(list(set([x for x in train_df.tag])))
tags_val = sorted(list(set([x for x in val_df.tag])))
tags_test = sorted(list(set([x for x in test_df.tag])))

max_tags_list = max([len(tags_train),len(tags_val),len(tags_test)])

# Training set tags list
tags_mismatch(tags_train,tags_val,tags_test,'Training','Validation','Test')

# Validation set tags list
tags_mismatch(tags_val,tags_train,tags_test,'Validation','Training','Test')

# Test set tags list
tags_mismatch(tags_test,tags_train,tags_val,'Test','Training','Validation')

# Histograms of occurencies of words per tag   
plot_value_counts(train_df, 'tag', 'Training')
plot_value_counts(val_df, 'tag', 'Validation')
plot_value_counts(test_df, 'tag', 'Test')

Training tags number: 45
Training tags list: ['#', '$', "''", ',', '-LRB-', '-RRB-', '.', ':', 'CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NN', 'NNP', 'NNPS', 'NNS', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB', '``']
	Classes in Training set for which there are no samples in Validation set: ['SYM']
	Classes in Training set for which there are no samples in Test set: ['#', 'FW', 'LS', 'SYM', 'UH']

Validation tags number: 44
Validation tags list: ['#', '$', "''", ',', '-LRB-', '-RRB-', '.', ':', 'CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NN', 'NNP', 'NNPS', 'NNS', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB', '``']
	Classes in Validation set for which there are no samples in Test set: ['#', 'FW', 'LS', 'UH']

Test tags number: 40
Test tags list: ['$', "'

The most frequent tags in each set are similar, with Nouns (NN) being the most frequent, followed by prepositions (IN), determiners (DT), and proper nouns (NNP), this suggests that nouns and prepositions are the most frequent parts of speech in the text, and that the datasets are similar in terms of the distribution of tags.



In [39]:
# Retriving prepocessed data and grouping in sentences
X_train_raw = train_df.groupby('sentence').word.apply(list).reset_index()['word']
X_val_raw = val_df.groupby('sentence').word.apply(list).reset_index()['word']
X_test_raw = test_df.groupby('sentence').word.apply(list).reset_index()['word']

y_train_raw = train_df.groupby('sentence').tag.apply(list).reset_index()['tag']
y_val_raw = val_df.groupby('sentence').tag.apply(list).reset_index()['tag']
y_test_raw = test_df.groupby('sentence').tag.apply(list).reset_index()['tag']

In [52]:
# Plot tag distributions per sentence
plot_tag_distribution(y_train_raw,'Training')
plot_tag_distribution(y_val_raw,'Validation')
plot_tag_distribution(y_test_raw,'Test')

##-Vocabulary part-

GloVe Vocabulary (V1)

In [None]:
#Setting seed for reproducibility
set_reproducibility(42)

#Use the 300 dimensional GLove Word Embeddings
glove_dir = './'
embedding_dim = 300
embedding_dict = {} #initialize dictionary
f = open(os.path.join(glove_dir, f'glove.6B.{embedding_dim}d.txt'))
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embedding_dict[word] = coefs
f.close()

print('Found %s word vectors.' % len(embedding_dict))

# Computing mean embeddings per tag
tag_dict = mean_embed4tag(train_df, tags_train, embedding_dict, embedding_dim)

Found 400000 word vectors.
Computed mean embeddings for 45 tags.


V1 + Training set OOV (V2)

In [None]:
# Computing the embeddings for the OOV words found in training set
embedding_dict = update_vocab(train_df,embedding_dict,tag_dict,embedding_dim)

Added 363 OOV words + respective embeddings to the vocabulary.
Added 1983 Capitalized words + respective embeddings to the vocabulary.


V2 + Validation set OOV (V3)

In [None]:
# Computing the embeddings for the OOV words found in validation set
embedding_dict = update_vocab(val_df,embedding_dict,tag_dict,embedding_dim)

Added 190 OOV words + respective embeddings to the vocabulary.
Added 754 Capitalized words + respective embeddings to the vocabulary.


V3 + Test set OOV (V4)

In [None]:
# Computing the embeddings for the OOV words found in test set
embedding_dict = update_vocab(test_df,embedding_dict,tag_dict,embedding_dim)

Added 129 OOV words + respective embeddings to the vocabulary.
Added 326 Capitalized words + respective embeddings to the vocabulary.


In [None]:
def closest_glove_embeddings(word, glove_embeddings, n=5):
  
    target_embedding = glove_embeddings[word]
    similarities = []
    for key, value in glove_embeddings.items():
        cosine_similarity = np.dot(target_embedding, value) / (np.linalg.norm(target_embedding) * np.linalg.norm(value))
        similarities.append((key, cosine_similarity))
    closest_words = sorted(similarities, key=lambda x: x[1], reverse=True)[:n]
    return closest_words

closest = closest_glove_embeddings('constitutional-law', embedding_dict)
print(closest)

[('car-development', 1.0), ('constitutional-law', 1.0), ('mininum-wage', 0.9999996335509684), ('stock-price', 0.9999996192929701), ('flim-flammery', 0.999999616484122)]


In [None]:
# Building the actual word vocabulary

index2word = OrderedDict()
word2index = OrderedDict()

# Adding the entry for padding
index2word[0] = '-PAD-'
word2index['-PAD-'] = 0

curr_idx = 1
for key in embedding_dict.keys():
  word2index[key] = curr_idx
  index2word[curr_idx] = key
  curr_idx += 1

vocab_length = len(word2index) 
print(f'[Debug] Index -> Word vocabulary size: {len(index2word)}')
print(f'[Debug] Word -> Index vocabulary size: {len(word2index)}')

[Debug] Index -> Word vocabulary size: 403746
[Debug] Word -> Index vocabulary size: 403746


In [None]:
# Tag vocabulary

tag2index = OrderedDict()
index2tag = OrderedDict()

# Adding the entry for padding
index2tag[0] = '-PAD-'
tag2index['-PAD-'] = 0

curr_id = 1
for tag in tags_train:
  tag2index[tag] = curr_id
  index2tag[curr_id] = tag
  curr_id += 1

print(f'[Debug] Index -> Tag vocabulary size: {len(index2tag)}')
print(f'[Debug] Tag -> Index vocabulary size: {len(tag2index)}')


[Debug] Index -> Tag vocabulary size: 46
[Debug] Tag -> Index vocabulary size: 46


In [None]:
# Tokenising words and tags by their indexes in vocabulary
X_train_np, X_val_np, X_test_np, y_train_np, y_val_np, y_test_np = [], [], [], [], [], []

# Encode X
for sentence in X_train_raw:
    sent_int = []
    for word in sentence:
            sent_int.append(word2index[word])
    X_train_np.append(sent_int)

for sentence in X_val_raw:
    sent_int = []
    for word in sentence:
            sent_int.append(word2index[word])
    X_val_np.append(sent_int)

for sentence in X_test_raw:
    sent_int = []
    for word in sentence:
            sent_int.append(word2index[word])
    X_test_np.append(sent_int)

# Encode Y
for sent_tags in y_train_raw:
    y_train_np.append([tag2index[tag] for tag in sent_tags])

for sent_tags in y_val_raw:
    y_val_np.append([tag2index[tag] for tag in sent_tags])

for sent_tags in y_test_raw:
    y_test_np.append([tag2index[tag] for tag in sent_tags])

# Examples
print('-Not encoded')
print('\t',X_train_raw[0]) 
print('\t',y_train_raw[0])
print('-Encoded')
print('\t',X_train_np[0])
print('\t',y_train_np[0])

-Not encoded
	 ['Pierre', 'Vinken', ',', '61', 'years', 'old', ',', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'Nov.', '29', '.']
	 ['NNP', 'NNP', ',', 'CD', 'NNS', 'JJ', ',', 'MD', 'VB', 'DT', 'NN', 'IN', 'DT', 'JJ', 'NN', 'NNP', 'CD', '.']
-Encoded
	 [400001, 400002, 2, 4979, 83, 168, 2, 44, 1430, 1, 535, 20, 8, 128565, 370, 400003, 1264, 3]
	 [21, 21, 4, 10, 23, 15, 4, 19, 35, 11, 20, 14, 11, 15, 20, 21, 10, 7]


In [None]:
# Checking the lengths of the sentences
lengths = [len(sentence) for sentence in X_train_raw]
lengths.sort()

# Showing a boxplot of the lengths of the sentences
fig = px.box(lengths)
fig.update_layout(xaxis_title='',
                  yaxis_title='',
                  title='Words per sentence')
fig.show()

In [None]:
# Maximum words in a sentence
MAX_LENGTH = lengths[-1] 
# Second longest sentence 
PAD_LENGTH = lengths[-2] 

print(f'Length of longest sentence: {MAX_LENGTH}')
print(f'Second longest sentence length: {PAD_LENGTH}')

# Padding the sequences
X_train = pad_sequences(X_train_np, maxlen=PAD_LENGTH, padding='post')
X_val = pad_sequences(X_val_np, maxlen=PAD_LENGTH, padding='post')
X_test = pad_sequences(X_test_np, maxlen=PAD_LENGTH, padding='post')

y_train = pad_sequences(y_train_np, maxlen=PAD_LENGTH, padding='post')
y_val = pad_sequences(y_val_np, maxlen=PAD_LENGTH, padding='post')
y_test = pad_sequences(y_test_np, maxlen=PAD_LENGTH, padding='post')

print('-Padded')
print('\tX:',X_train[0])
print('\n\ty:',y_train[0])

Length of longest sentence: 249
Second longest sentence length: 114
-Padded
	X: [400001 400002      2   4979     83    168      2     44   1430      1
    535     20      8 128565    370 400003   1264      3      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0]

	y: [21 21  4 10 23 15  4 19 35 11 20 14 11 15 20 21 10  7  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0 

In [None]:
# One hot encoding the sets
y_train_one_hot = to_categorical(y_train, len(tag2index))
y_val_one_hot = to_categorical(y_val, len(tag2index))
y_test_one_hot = to_categorical(y_test, len(tag2index))

## 2. GloVe 
GloVe (Global Vectors for Word Representation) is a method for learning vector representations of words, called "word embeddings," from a large corpus of text. Word embeddings are numerical representations of words that capture the semantic relationships between words in a continuous, low-dimensional space. They are commonly used as input to natural language processing models, such as language translation and language modeling.

GloVe works by learning the co-occurrence statistics of words in a corpus, and using this information to learn word embeddings that capture the semantic relationships between words. The GloVe method produces word embeddings that are trained on a global corpus, as opposed to embeddings that are trained on a specific task or dataset.

There are different versions of the GloVe word embeddings, including 50-dimensional, 100-dimensional, and 200-dimensional embeddings. The 50-dimensional version of GloVe embeddings may be better in some applications because they have a lower dimensionality, which can make them easier to work with and more computationally efficient.

By using GloVe embeddings as the initial weights for a model, we can take advantage of these pre-trained word representations and fine-tune them for a specific task.

In [None]:
#Building the Embedding Layer 
embedding_matrix = np.zeros((len(word2index), embedding_dim))
for word, i in word2index.items():
  if word != '-PAD-':
    embedding_vector = embedding_dict.get(word)
    embedding_matrix[i] = embedding_vector

## 3. Model
### 3.1 Baseline 
Bidirectional LSTM layers are able to process sequential data in both the forward and backward directions, which can allow the model to capture contextual information from both the past and the future. This can be particularly useful for natural language processing tasks, where the meaning of a word can depend on the context in which it is used.

In the context of POS tagging, TimeDistributed can be used to apply a tag prediction layer to each word in a sentence. For example, you might have an RNN that processes a sequence of words in a sentence, and at each time step, the RNN outputs a hidden state. You could then apply a TimeDistributed dense layer to the hidden states, which would allow you to predict the POS tag for each word in the sentence.

One advantage of using TimeDistributed for POS tagging is that it allows you to predict the POS tag for each word in the sentence simultaneously, rather than having to process the sentence one word at a time. This can be particularly useful when dealing with long sentences, as it can make the tagging process more efficient.

Overall, using TimeDistributed for POS tagging can help you build more accurate and efficient models for natural language processing tasks that involve sequential data.

In [None]:
# List of tags to ignore
ignore = [':', '#', '$', '-LRB-', '-RRB-', ',', '.', "''", '``', 'SYM','LS','-PAD-']

# Custom metrics ignoring classes
ignore_accuracy = ignore_class_accuracy([tag2index[tag] for tag in ignore])
ignore_precision = ignore_class_precision([tag2index[tag] for tag in ignore])
ignore_recall = ignore_class_recall([tag2index[tag] for tag in ignore])
metrics = [ignore_accuracy,ignore_precision,ignore_recall]

# Learning Rate
LR = 0.05

# Embedding layer parameters
embedding_params = {'input_dim': vocab_length,'output_dim': embedding_dim,
                    'weights': [embedding_matrix],'input_length': PAD_LENGTH}
# Callbacks
callbacks = [
    tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True,verbose=True),
    tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.3, patience=2, verbose=True, min_lr=0.001)
]

# Seeds
seeds = [23, 42, 69]

In [None]:
baseline_layer_params = [{'layer_type': 'Bidirectional',
                          'layer_kwargs': {'units': 256}},
                         {'layer_type': 'Dense',
                          'layer_kwargs':{'units': len(tag2index),'activation': 'softmax'}}]

bl_training_params = {'x': X_train, 'y': y_train_one_hot, 'validation_data': (X_val, y_val_one_hot),
                  'batch_size': 128, 'epochs': 1, 'callbacks': callbacks} 

baseline_model_recaps = run_models('Baseline',baseline_layer_params,embedding_params,bl_training_params,metrics,LR,seeds)

Running with seed: 23
Model: "Baseline_23"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_9 (Embedding)     (None, 114, 300)          121123800 
                                                                 
 bidirectional_9 (Bidirectio  (None, 114, 512)         1140736   
 nal)                                                            
                                                                 
 time_distributed_9 (TimeDis  (None, 114, 46)          23598     
 tributed)                                                       
                                                                 
Total params: 122,288,134
Trainable params: 1,164,334
Non-trainable params: 121,123,800
_________________________________________________________________
Fitting the Baseline model...
Evaluating the Baseline model...
Obtaining predictions from the Baseline model...

Macro f1 score: 0.303666754122

KeyboardInterrupt: ignored

In [None]:
mean_metrics_bl = mean_metrics(baseline_model_recaps)

Mean values: {'macro_f1': 0.2771633972814825, 'ignore_precision': 0.17996142307917276, 'ignore_recall': 0.1656841735045115}


### 3.2 GRU 
Gated Recurrent Units (GRUs) are a type of recurrent neural network (RNN) that are often used in natural language processing tasks such as part-of-speech (POS) tagging. GRUs are similar to long short-term memory (LSTM) networks, but they have a simpler structure and fewer parameters, making them easier to train and faster to run. In POS tagging, GRUs can be used to process a sequence of words and predict the POS tags for each word in the sequence. GRUs are able to take into account contextual information from the previous words in the sequence, allowing them to make more accurate predictions about the POS tags for the current word. 

Both BiLSTMs (Bidirectional LSTMs) and Gated Recurrent Units (GRUs) have been shown to perform well on a variety of NLP tasks, including POS tagging, but here we obtained slightly better results than with the baseline; the reason may be that LSTMs are are particularly well-suited for tasks that require the model to remember and make use of long-term dependencies in the data, while the longest sentence in the Penn Treebank dataset has only 171 words and the average of words per sentence is around 20.  

In [None]:
gru_layer_params = [{'layer_type': 'GRU',
                     'layer_kwargs': {'units': 256}},
                    {'layer_type': 'Dense',
                     'layer_kwargs':{'units': len(tag2index),'activation': 'softmax'}}]

gru_training_params = {'x': X_train, 'y': y_train_one_hot, 'validation_data': (X_val, y_val_one_hot),
                  'batch_size': 128, 'epochs': 15, 'callbacks': callbacks} 

gru_model_recaps = run_models('GRU',gru_layer_params,embedding_params,gru_training_params,metrics,LR,seeds)

Running with seed: 23
Model: "GRU_23"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_13 (Embedding)    (None, 114, 300)          121123800 
                                                                 
 gru_1 (GRU)                 (None, 114, 256)          428544    
                                                                 
 time_distributed_12 (TimeDi  (None, 114, 46)          11822     
 stributed)                                                      
                                                                 
Total params: 121,564,166
Trainable params: 440,366
Non-trainable params: 121,123,800
_________________________________________________________________

Fitting the GRU model...
Epoch 1/15

KeyboardInterrupt: ignored

In [None]:
mean_metrics_gru = mean_metrics(gru_model_recaps)

### 3.3 Additional LSTM layer 
Using two BiLSTMs layers can allow the model to learn more complex patterns in the data and make more accurate predictions. 
However, they can increase the computational complexity of our model, which may require more computational resources to train.

With the same number of epochs the results were similar to the baseline and the training process was slower; it is possible that the model with two BiLSTMs is more prone to overfitting, meaning that it is able to fit the training data very well but is less able to generalize to new data. Another possibility is that the model with two BiLSTMs simply takes longer to train. That is why we raised the training epochs to 20, obtaining better results.

In [None]:
LR = 0.01

add_lstm_layer_params = [{'layer_type': 'Bidirectional',
                          'layer_kwargs': {'units': 256}},
                         {'layer_type': 'Bidirectional',
                          'layer_kwargs': {'units': 256}},
                         {'layer_type': 'Dense',
                          'layer_kwargs':{'units': len(tag2index),'activation': 'softmax'}}]

add_lstm_training_params = {'x': X_train, 'y': y_train_one_hot, 'validation_data': (X_val, y_val_one_hot),
                  'batch_size': 128, 'epochs': 25, 'callbacks': callbacks} 

add_lstm_model_recaps = run_models('Additional_LSTM',add_lstm_layer_params,embedding_params,add_lstm_training_params,metrics,LR,seeds)

Running with seed: 23
Model: "Additional_LSTM_23"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_14 (Embedding)    (None, 114, 300)          121123800 
                                                                 
 bidirectional_11 (Bidirecti  (None, 114, 512)         1140736   
 onal)                                                           
                                                                 
 bidirectional_12 (Bidirecti  (None, 114, 512)         1574912   
 onal)                                                           
                                                                 
 time_distributed_13 (TimeDi  (None, 114, 46)          23598     
 stributed)                                                      
                                                                 
Total params: 123,863,046
Trainable params: 2,739,246
Non-trainable params: 121,123,800
____

KeyboardInterrupt: ignored

In [None]:
mean_metrics_add_lstm = mean_metrics(add_lstm_model_recaps)

0.6530649956508587


### 3.4 Additional dense layer

Using two dense layers, one with a non-linear activation function and one with a softmax activation function, is a common pattern in neural network architectures for classification tasks.

The purpose of the non-linear dense layer is to introduce non-linearity into the model, which can allow the model to learn more complex patterns in the data. Common choices for the activation function in this layer include ReLU (Rectified Linear Unit), sigmoid, and tanh.

The purpose of the softmax dense layer is to produce a probability distribution over the possible classes. The softmax activation function transforms the output of the preceding layer into a probability distribution, where the sum of the probabilities is equal to 1. This is useful for classification tasks, where you want to predict the probability that an input belongs to each of the possible classes. Using two dense layers in this way can allow the model to learn more complex patterns in the data and make more accurate predictions.

We have increased the number of training epochs to 15 for the same reasons as before.

In [None]:
LR = 0.01

add_fc_layer_params = [{'layer_type': 'Bidirectional',
                          'layer_kwargs': {'units': 256}},
                         {'layer_type': 'Dense',
                          'layer_kwargs':{'units': PAD_LENGTH,'activation': 'relu'}},
                         {'layer_type': 'Dense',
                          'layer_kwargs':{'units': len(tag2index),'activation': 'softmax'}}]

add_fc_training_params = {'x': X_train, 'y': y_train_one_hot, 'validation_data': (X_val, y_val_one_hot),
                          'batch_size': 128, 'epochs': 25, 'callbacks': callbacks} 

add_fc_model_recaps = run_models('Additional_FC',add_fc_layer_params,embedding_params,add_fc_training_params,metrics,LR,seeds)

Running with seed: 23
Model: "Additional_FC_23"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_15 (Embedding)    (None, 114, 300)          121123800 
                                                                 
 bidirectional_13 (Bidirecti  (None, 114, 512)         1140736   
 onal)                                                           
                                                                 
 time_distributed_14 (TimeDi  (None, 114, 114)         58482     
 stributed)                                                      
                                                                 
 time_distributed_15 (TimeDi  (None, 114, 46)          5290      
 stributed)                                                      
                                                                 
Total params: 122,328,308
Trainable params: 1,204,508
Non-trainable params: 121,123,800
______

KeyboardInterrupt: ignored

In [None]:
mean_metrics_add_fc = mean_metrics(add_fc_model_recaps)

## 4. Comparisons


In [None]:
# Sample data
data = [mean_metrics_bl['macro_f1'], mean_metrics_gru['macro_f1'],
        mean_metrics_add_lstm['macro_f1'], mean_metrics_add_fc['macro_f1']]

# Create figure
fig = go.Figure(data=[go.Bar(x=['Baseline','GRU', 'Add. LSTM', 'Add. FC'], y=data)])

# Set titles for axes
fig.update_layout(xaxis_title='Model type', yaxis_title='Mean Macro F1')

# Show plot
fig.show()


In [None]:
#Test phrase
np.random.seed(42)
idx = np.random.randint(0, len(X_val_np))

print(f'Test sentence: {[index2word[word] for word in X_val_np[idx]]}') 

#ground truth
print(f'Ground truth tags: {[index2tag[tag] for tag in y_val_np[idx]]}')

#test bl
print(f'Baseline predictions: {[index2tag[tag] for tag in baseline_model_recaps[1]["predictions"][idx]]}')
#test gru
print(f'GRU predictions: {[index2tag[tag] for tag in gru_model_recaps[1]["predictions"][idx]]}')
#test add lstm
print(f'Additional LSTM layer predictions: {[index2tag[tag] for tag in add_lstm_model_recaps[1]["predictions"][idx]]}')
#test add fc
print(f'Additional FC layer predictions: {[index2tag[tag] for tag in add_fc_model_recaps[1]["predictions"][idx]]}')



Test sentence: ['$', '107', 'million', 'of', 'tax', 'allocation', 'bonds', ',', '1989', 'Series', 'A-D', ',', 'due', '1991-1999', ',', '2009', 'and', '2019', ',', 'tentatively', 'priced', 'by', 'a', 'Donaldson', 'Lufkin', '&', 'Jenrette', 'Securities', 'Corp.', 'group', 'to', 'yield', 'from', '6.40', '%', 'in', '1991', 'to', '7.458', '%', 'in', '2019', '.']
Ground truth tags: ['$', 'CD', 'CD', 'IN', 'NN', 'NN', 'NNS', ',', 'CD', 'NNP', 'NNP', ',', 'JJ', 'CD', ',', 'CD', 'CC', 'CD', ',', 'RB', 'VBN', 'IN', 'DT', 'NNP', 'NNP', 'CC', 'NNP', 'NNPS', 'NNP', 'NN', 'TO', 'VB', 'IN', 'CD', 'NN', 'IN', 'CD', 'TO', 'CD', 'NN', 'IN', 'CD', '.']
Baseline predictions: ['$', 'CD', 'CD', 'IN', 'NN', 'JJ', 'NNS', ',', 'CD', 'JJ', 'NNP', ',', 'JJ', 'NNP', ',', 'NNP', 'CC', 'CD', ',', 'RB', 'VBN', 'IN', 'DT', 'NNP', 'NNP', 'CC', 'NNP', 'NNP', 'NNP', 'NNP', 'TO', 'NN', 'IN', 'CD', 'NN', 'IN', 'CD', 'TO', 'CD', 'NN', 'IN', 'CD', '.', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', 

In [None]:
#Computing macro scores on test set
#Additional LSTM layer
scores_add_lstm = add_lstm_model.evaluate(X_test, y_test_one_hot, return_dict = True)

predictions_add_lstm_one_hot_encode = add_lstm_model.predict(X_test)

# Convert the class probabilities into class labels
predictions_add_lstm = np.argmax(predictions_add_lstm_one_hot_encode, axis=-1)

# Create a binary mask for the classes to exclude
mask = np.logical_not(np.isin(y_test, [tag2index[tag] for tag in ignore]))

# Use the mask to exclude the classes that are in the list from the true positive, false positive, and false negative counts
macro_f1_add_lstm = f1_score(y_test[mask], predictions_add_lstm[mask], average='macro')

print(macro_f1_add_lstm)

0.7260165908799573


In [None]:
#Computing macro scores on test set
#Additional FC layer
scores_add_fc = add_fc_model.evaluate(X_test, y_test_one_hot, return_dict = True)

predictions_add_fc_one_hot_encode = add_fc_model.predict(X_test)

# Convert the class probabilities into class labels
predictions_add_fc = np.argmax(predictions_add_fc_one_hot_encode, axis=-1)

# Create a binary mask for the classes to exclude
mask = np.logical_not(np.isin(y_test, [tag2index[tag] for tag in ignore]))

# Use the mask to exclude the classes that are in the list from the true positive, false positive, and false negative counts
macro_f1_add_fc = f1_score(y_test[mask], predictions_add_fc[mask], average='macro')

print(macro_f1_add_fc)

0.7575634250338997


In [None]:
# # Create a list of epochs (i.e., the x-axis data)
epochs = list(range(1, len(results_baseline.history['val_accuracy'])+1))

# # Create a Plotly line plot using the epochs and validation accuracy data
fig = go.Figure()
fig.add_trace(go.Scatter(x=epochs, y=results_baseline.history['val_accuracy'], name='Baseline - BiLSTM Model', mode='lines+markers'))
fig.add_trace(go.Scatter(x=epochs, y=results_gru.history['val_accuracy'], name='GRU Model', mode='lines+markers'))
fig.show()

# Create a list of epochs (i.e., the x-axis data)
epochs = list(range(1, len(results_add_lstm.history['val_accuracy'])+1))

fig2 = go.Figure()
fig2.add_trace(go.Scatter(x=epochs, y=results_add_lstm.history['val_accuracy'], name='2 BiLSTMs Model', mode='lines+markers'))
fig2.add_trace(go.Scatter(x=epochs, y=results_add_fc.history['val_accuracy'], name='2 FCs Model', mode='lines+markers'))
fig2.show()