https://github.com/pranavphoenix/BiLSTM-POS-Tagging/blob/main/BiLSTM_POS_Tagging.ipynb

https://linguistics.stackexchange.com/questions/16897/unable-to-understand-meaning-of-tag-none-1-in-penn-treebank-example

TODO:
- Cacasburo
- Guardare creazione dizionario, bisogna rispettare i punti dell'assignment;
- Non togliere punctuation e symbols ma evitare di utilizzarli nel calcolo delle metriche, magari utilizzando l'array di pesi 'sample_weight' che si trova nell'altro notebook;
- Provare se i risultati migliorano con preprocessing (e.g. lowerando le parole);
- Aggiustare il notebook perché fa cagare;

# Assignment 1

**Due to**: 11/01/2022 (dd/mm/yyyy)

If you deliver it by 11/12/2021 your assignment will be graded by 11/01/2022.


**Credits**: Andrea Galassi, Federico Ruggeri, Paolo Torroni

**Summary**: Part-of Speech (POS) tagging as Sequence Labelling using Recurrent Neural Architectures

# Execution
## 0.1 Imports

In [2]:
%pip install keras_preprocessing

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting keras_preprocessing
  Downloading Keras_Preprocessing-1.1.2-py2.py3-none-any.whl (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.6/42.6 KB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: keras_preprocessing
Successfully installed keras_preprocessing-1.1.2


In [3]:
import nltk
import numpy as np
import os
import re
import random
import pandas as pd
import tensorflow as tf
from collections import defaultdict, OrderedDict

import keras
from keras import backend as K
from keras.layers import Dense, LSTM, InputLayer, Bidirectional, TimeDistributed, GRU, Embedding, Activation
from keras.models import Sequential
from keras.optimizers import Adam
from keras_preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from sklearn.metrics import f1_score

import urllib.request
import zipfile
import progressbar
from IPython.display import display_html
from itertools import chain,cycle

import plotly.graph_objs as go
import plotly.express as px

import pickle
import gc


## 0.2 Functions

### Utils - data analysis - plots

In [4]:
# Progress bar
pbar = None
def show_progress(block_num, block_size, total_size):
    global pbar
    if pbar is None:
        pbar = progressbar.ProgressBar(maxval=total_size)
        pbar.start()

    downloaded = block_num * block_size
    if downloaded < total_size:
        pbar.update(downloaded)
    else:
        pbar.finish()
        pbar = None

# Setting the seeds
def set_reproducibility(seed):
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)
    os.environ['TF_DETERMINISTIC_OPS'] = '1'
           
# Display dataframes
def display(*args,titles=cycle([''])):
    html_str=''
    for df,title in zip(args, chain(titles,cycle(['</br>'])) ):
        html_str+='<th style="text-align:left"><td style="vertical-align:top">'
        html_str+=f'<h4 style="text-align: left;">{title}</h2>'
        html_str+=df.to_html().replace('table','table style="display:inline"')
        html_str+='</td></th>'
    display_html(html_str,raw=True)

# Show mismatches of classes in different sets
def tags_mismatch(tags1,tags2,tags3,name1,name2,name3):

  print(f'{name1} tags number: {len(tags1)}')
  print(f'{name1} tags list: {tags1}')

  exceeding_validation = [el for el in tags1 if el not in tags2]
  if exceeding_validation != []:
    print(f'\tClasses in {name1} set for which there are no samples in {name2} set: {exceeding_validation}')

  exceeding_test = [el for el in tags1 if el not in tags3]

  if exceeding_test != []:
    print(f'\tClasses in {name1} set for which there are no samples in {name3} set: {exceeding_test}\n')


# Histograms of occurencies of words by tag
def plot_value_counts(df, key, name):
    values = df[key].value_counts()
    fig = px.bar(x=values.index, y=values.values)
    fig.update_layout(xaxis_title=key,
                      yaxis_title='Occurencies of words',
                      title=f'{name} set words per tag')
    fig.show()

# Plot tag distribution per sentence   
def plot_tag_distribution(tag_lists,name):
    tag_counts = []
    for tags in tag_lists:
        tag_dict = {}
        for tag in tags:
            if tag in tag_dict:
                tag_dict[tag] += 1
            else:
                tag_dict[tag] = 1
        tag_counts.append(tag_dict)
    
    df = pd.DataFrame(tag_counts)
    df = df.fillna(0)
    df = df.apply(lambda x: x / sum(x) * 100)
    
    fig = px.line(df, title=f'Tag Distribution per {name} Sentence')
    fig.show()

# Plot training metrics
def plot_train(baseline_model_recaps,gru_model_recaps,add_lstm_model_recaps,add_fc_model_recaps,metric,ep=15):
  epochs = np.arange(1,ep,1)
  fig = go.Figure()
  for i in range(len(baseline_model_recaps)):
    fig.add_trace(go.Scatter(x=epochs, y=baseline_model_recaps[i]['history'].history[metric], name=baseline_model_recaps[i]['name'], mode='lines+markers',marker=dict(color=f'rgb(250,{i*90},0)')))
  for i in range(len(gru_model_recaps)):
    fig.add_trace(go.Scatter(x=epochs, y=gru_model_recaps[i]['history'].history[metric], name=gru_model_recaps[i]['name'], mode='lines+markers',marker=dict(color=f'rgb(0,{i*90},250)')))
  for i in range(len(add_lstm_model_recaps)):
    fig.add_trace(go.Scatter(x=epochs, y=add_lstm_model_recaps[i]['history'].history[metric], name=add_lstm_model_recaps[i]['name'], mode='lines+markers',marker=dict(color=f'rgb(120,{i*50},200)')))
  for i in range(len(add_fc_model_recaps)):
    fig.add_trace(go.Scatter(x=epochs, y=add_fc_model_recaps[0]['history'].history[metric], name=add_fc_model_recaps[i]['name'], mode='lines+markers',marker=dict(color=f'rgb(0,250,{i*90})')))
  fig.update_layout(title=f'{metric} during training', height=750)

  fig.show()

### Vocabulary and OOV handling

To compute the embeddings for out-of-vocabulary (OOV) words, we took the mean of existing embeddings related to the words with the same part of speech (POS) tag and added noise. This approach is based on the assumption that words with similar POS tags are also semantically similar, and therefore, their embeddings should be similar. Taking the mean of the existing embeddings provides a general representation of the semantic space of the words with similar POS tags, and adding noise helps to avoid overfitting by making the embeddings for the OOV words slightly different from each other.

This approach can be effective in some cases, especially if the number of OOV words is small and their semantic similarity to the in-vocabulary (IV) words is high. However, it's important to keep in mind that this approach may not always be the best option, as the quality of the OOV word embeddings depends on the quality and diversity of the IV word embeddings used to compute the mean. If the IV word embeddings are not representative of the words with similar POS tags, the OOV word embeddings may not be of good quality.





In [5]:
# Compute embeddings based on the respective tag means.
def mean_embed4tag(df, tags, embedding_dict, embedding_dim):
  tag_dict = {tag:np.zeros(embedding_dim) for tag in tags}
  tag_count = {tag:0 for tag in tags}

  for _ , row in df.iterrows():
    for tag in tags:
      if tag == row['tag']:
        if row['word'].lower() in embedding_dict:
            tag_count[tag] += 1
            tag_dict[tag] += embedding_dict[row['word'].lower()]

  for tag in tags:
    if np.all(tag_dict[tag]):
      tag_dict[tag] = tag_dict[tag] / tag_count[tag]
  print(f'Computed mean embeddings for {len(tags)} tags.')
  return tag_dict
   
#Update vocabulary
def update_vocab(df,embeddings_index,tag_dict,embedding_dim,seed=42): 
  oov_c = 0 
  cap_oov = 0
  np.random.seed(seed)
  for _ , row in df.iterrows():
    if row['word'] not in embeddings_index:
      if row['word'].lower() not in embeddings_index:
        oov_c += 1
        noise = np.random.normal(0, 0.0001, size=embedding_dim)
        embeddings_index[row['word']] = tag_dict[row['tag']] + noise       
      else:
        cap_oov += 1  
        embeddings_index[row['word']] = embeddings_index[row['word'].lower()]

  counts = [oov_c,cap_oov,0]
  print(f'Added {oov_c} OOV words + respective embeddings to the vocabulary.')
  print(f'Added {cap_oov} Capitalized words + respective embeddings to the vocabulary.')
  return embeddings_index, counts

# Encode sentences and tags
def encode_sentences(raw_sentences,raw_tags,vocab,tags):
  encoded_sentences = []
  encoded_tags = []
  for sentence in raw_sentences:
      sent_int = []
      for word in sentence:
            sent_int.append(vocab[word])
      encoded_sentences.append(sent_int)

  for sent_tags in raw_tags:
    encoded_tags.append([tags[tag] for tag in sent_tags])

  return encoded_sentences, encoded_tags

### Custom metrics

This function first computes the per-sample accuracy, which is a binary tensor indicating whether the prediction for each sample is correct or not. Then, it multiplies the per-sample accuracy with the weights for the corresponding true class to obtain a weighted per-sample accuracy.

Next, it creates a binary ignore mask indicating which samples should be ignored in the computation of the overall accuracy. The mask is initialized as all ones and then updated to exclude the samples with the class labels specified in the classes argument.

Finally, it computes the overall weighted accuracy by summing the weighted per-sample accuracy and dividing by the number of non-ignored samples.

In [6]:
# Custom metric
# Weighted ignore class accuracy

def ignore_class_accuracy(weights, classes=[]):
  
    @tf.function
    def weighted_ignore_accuracy(y_true, y_pred):
        y_true_class = K.argmax(y_true, axis=-1)
        y_pred_class = K.argmax(y_pred, axis=-1)
        
        per_sample_accuracies = K.cast(K.equal(y_true_class, y_pred_class), 'float32')
        weighted_per_sample_accuracies = per_sample_accuracies * K.gather(weights, y_true_class)
        
        ignore_mask = K.ones_like(y_pred_class, dtype='int32')
        for to_ignore in classes:
          ignore_mask = K.cast(K.not_equal(y_pred_class, to_ignore), 'int32') * ignore_mask
        
        weighted_acc = K.sum(weighted_per_sample_accuracies * K.cast(ignore_mask, 'float32')) / K.maximum(K.cast(K.sum(ignore_mask), 'float32'), 1)


        return weighted_acc
    return weighted_ignore_accuracy


    
# Compute the mean of the metrics
def mean_metrics(models):
  ignore_values = {
    'macro_f1': [r['macro_f1'] for r in models],
    'weighted_ignore_accuracy': [r['scores']['weighted_ignore_accuracy'] for r in models]
  }

  mean_ignore_values = {k: np.mean(v) for k, v in ignore_values.items()}

  print("Mean values:", mean_ignore_values)

  return mean_ignore_values

def compute_weights(df, tag2index):
    # Words per tag in train set
    tag_counts = df['tag'].value_counts()
    # Encoding and sorting by the tag vocab index
    index = tag_counts.index.map(lambda x: tag2index.get(x, 0))
    encoded_tc = pd.DataFrame(tag_counts.values, index=index)
    encoded_tc = encoded_tc.sort_index()
    sorted_tc = encoded_tc.values
    # Normalizing the values
    weights = sorted_tc / sorted_tc.sum()
    # Adding the pad weight
    weights = np.insert(weights, 0, 0.01)
    # Reversing the values for weights
    weights = [1 - i for i in weights]
    print('Weights: ')
    print(weights)

    return weights

In [46]:
# Main function to run models/evaluations/predictions/scores.
def run_models(name,layer_params,embedding_params,training_params,metrics,LR,seeds):
  model_recaps = []

  for seed in seeds:
      print(f'Running with seed: {seed}')
      set_reproducibility(seed)
      
      # Define the model
      tf.keras.backend.clear_session()
      names = f'{name}_{seed}'
      model = Sequential(name=names)
      
      # Add the Embedding layer
      model.add(Embedding(**embedding_params, trainable=False))

      # Add layers
      for layer_param in layer_params:
          layer_type = layer_param['layer_type']
          layer_kwargs = layer_param['layer_kwargs']
          if layer_type == "Bidirectional":
                layer = Bidirectional(LSTM(**layer_kwargs,return_sequences=True))
          elif layer_type == "Dense":
                layer = TimeDistributed(Dense(**layer_kwargs))
          elif layer_type == "GRU":
                layer = GRU(**layer_kwargs,return_sequences=True)
          model.add(layer)

      # Compile the model
      model.compile(optimizer=Adam(LR), loss='categorical_crossentropy', metrics=metrics)

      # Summary
      model.summary()
      tf.keras.utils.plot_model(model,to_file=f'{name}.png')
      
      # Fitting the model
      print(f'\nFitting the {name} (seed {seed}) model...')
      history = model.fit(**training_params)
      
      # Obtain the predictions made by the model on the validation set
      print(f'Evaluating the {name} (seed {seed}) model...')
      scores = model.evaluate(X_val, y_val_one_hot, return_dict = True)

      print(f'Obtaining predictions from the {name} (seed {seed}) model...')
      predictions_one_hot_encode = model.predict(X_val)

      # Convert the class probabilities into class labels
      predictions = np.argmax(predictions_one_hot_encode, axis=-1)

      # Create a binary mask for the classes to exclude
      mask = np.logical_not(np.isin(y_val, [tag2index[tag] for tag in ignore]))

      # Use the mask to exclude the classes that are in the list from the true positive, false positive, and false negative counts
      macro_f1= f1_score(y_val[mask], predictions[mask], average='macro')

      model_recap = {
          "name": names,
          "model": model,
          "history": history,
          "scores": scores,
          "predictions": predictions,
          "macro_f1": macro_f1
      }

      model_recaps.append(model_recap)

      print(f'\nMacro f1 score: {macro_f1}\n')
      
      
      print(f'Garbage collection: {gc.collect()}')

  return model_recaps

# Test function
def test_f(model_recaps,X_test,X_test_np,y_test,y_test_np,tag2index,index2tag,index2word,ignore):
  mean_test = []
  for i in range(len(model_recaps)):
    print(f'Evaluating model {i+1} with the Test set:')

    predictions_one_hot_encode = model_recaps[i]["model"].predict(X_test)

    # Convert the class probabilities into class labels
    predictions = np.argmax(predictions_one_hot_encode, axis=-1)

    # Create a binary mask for the classes to exclude
    mask = np.logical_not(np.isin(y_test, [tag2index[tag] for tag in ignore]))

    # Use the mask to exclude the classes that are in the list from the true positive, false positive, and false negative counts
    macro_f1 = f1_score(y_test[mask], predictions[mask], average='macro')
    mean_test.append(macro_f1)
    print(f'Macro f1 score for model {i+1}: {macro_f1}\n')

  print(f'Mean macro f1 score on test set: {np.mean(mean_test)}\n')

  # POS-tagging a random test phrase
  idx = np.random.randint(0, len(X_test_np))

  print(f'Test sentence: {[index2word[word] for word in X_test_np[idx]]}') 

  # Ground truth
  print(f'Ground truth tags: {[index2tag[tag] for tag in y_test_np[idx]]}')

  # Test 
  print(f'Predictions: {[index2tag[tag] for tag in predictions[idx] if index2tag[tag] != "-PAD-"]}')

  return mean_test, predictions

In [8]:
# Downloading the dataset
nltk.download('treebank')

# Download the GloVe embeddings file
url = 'http://nlp.stanford.edu/data/glove.6B.zip'
urllib.request.urlretrieve(url, 'glove.6B.zip', show_progress)

# Extract the zip file
zip_ref = zipfile.ZipFile('glove.6B.zip', 'r')
zip_ref.extractall()

[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.
100% (862182613 of 862182613) |##########| Elapsed Time: 0:02:40 Time:  0:02:40


## 1. Corpus
### 1.1 Pre-processing

The `-NONE-` tag in the Natural Language Toolkit (NLTK) is used to represent words or tokens that do not have a specific Part-of-Speech (POS) tag. Removing these occurances from the data can be useful for a POS-tagging task as it reduces the noise in the data and improves the quality of the results. By removing the `-NONE-` tags, the model will have fewer examples of unstructured data to learn from and can instead focus on the examples that are more relevant to the task of POS-tagging. This can help the model learn more accurate patterns and relationships between words and their corresponding POS tags, leading to more accurate results in the end.

In [9]:
# Get the files' list
fileids = nltk.corpus.treebank.fileids()

# Get the Penn Treebank tagged sentences
train_corpus = nltk.corpus.treebank.tagged_sents(fileids[:100])
val_corpus = nltk.corpus.treebank.tagged_sents(fileids[100:150])
test_corpus = nltk.corpus.treebank.tagged_sents(fileids[150:])

# Flatten the lists
train_corpus = [tuple(list(item)+[str(idx)]) for idx,sublist in enumerate(train_corpus) for item in sublist if item[1] != '-NONE-']
val_corpus = [tuple(list(item)+[str(idx)]) for idx,sublist in enumerate(val_corpus) for item in sublist if item[1] != '-NONE-']
test_corpus = [tuple(list(item)+[str(idx)]) for idx,sublist in enumerate(test_corpus) for item in sublist if item[1] != '-NONE-']

# Create the Dataframes
train_df = pd.DataFrame(train_corpus, columns = ['word', 'tag', 'sentence'])
val_df = pd.DataFrame(val_corpus, columns = ['word', 'tag', 'sentence'])
test_df = pd.DataFrame(test_corpus, columns = ['word', 'tag', 'sentence'])

# Summary of the created Dataframes
display(train_df.describe(), val_df.describe(), test_df.describe(), titles = [f'Training set {train_df.shape}', f'Validation set {val_df.shape}', f'Test set {test_df.shape}'])

Unnamed: 0,word,tag,sentence
count,47356,47356,47356
unique,8009,45,1963
top,",",NN,1854
freq,2570,6270,249

Unnamed: 0,word,tag,sentence
count,31183,31183,31183
unique,5892,44,1299
top,",",NN,339
freq,1528,4513,81

Unnamed: 0,word,tag,sentence
count,15545,15545,15545
unique,3623,40,652
top,",",NN,232
freq,787,2383,58


The number of words and in particular unique words in each set is different, with the training set having the most and the test set having the least.

The most frequent word in each set is `,` and the most frequent tag is `NN` (noun, singular or mass). This suggests that the datasets might have a large number of common words and that nouns might be the most frequent part of speech in the text, apart from the comma that will be ignored in the final scores computation.

In [10]:
# Ordering tags in the sets
tags_train = sorted(list(set([x for x in train_df.tag])))
tags_val = sorted(list(set([x for x in val_df.tag])))
tags_test = sorted(list(set([x for x in test_df.tag])))

max_tags_list = max([len(tags_train),len(tags_val),len(tags_test)])

# Training set tags list
tags_mismatch(tags_train,tags_val,tags_test,'Training','Validation','Test')

# Validation set tags list
tags_mismatch(tags_val,tags_train,tags_test,'Validation','Training','Test')

# Test set tags list
tags_mismatch(tags_test,tags_train,tags_val,'Test','Training','Validation')

# Histograms of occurencies of words per tag   
plot_value_counts(train_df, 'tag', 'Training')
plot_value_counts(val_df, 'tag', 'Validation')
plot_value_counts(test_df, 'tag', 'Test')

Training tags number: 45
Training tags list: ['#', '$', "''", ',', '-LRB-', '-RRB-', '.', ':', 'CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NN', 'NNP', 'NNPS', 'NNS', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB', '``']
	Classes in Training set for which there are no samples in Validation set: ['SYM']
	Classes in Training set for which there are no samples in Test set: ['#', 'FW', 'LS', 'SYM', 'UH']

Validation tags number: 44
Validation tags list: ['#', '$', "''", ',', '-LRB-', '-RRB-', '.', ':', 'CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NN', 'NNP', 'NNPS', 'NNS', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB', '``']
	Classes in Validation set for which there are no samples in Test set: ['#', 'FW', 'LS', 'UH']

Test tags number: 40
Test tags list: ['$', "'

The most frequent tags in each set are similar, with Nouns (NN) being the most frequent, followed by prepositions (IN), determiners (DT), and proper nouns (NNP), this suggests that nouns and prepositions are the most frequent parts of speech in the text, and that the datasets are similar in terms of the distribution of tags.

The analysis of these distributions will be crucial in setting the weights to counteract the unbalanceness of the datasets while looking at the metrics.

In [11]:
# Retriving prepocessed data and grouping in sentences
X_train_raw = train_df.groupby('sentence').word.apply(list).reset_index()['word']
X_val_raw = val_df.groupby('sentence').word.apply(list).reset_index()['word']
X_test_raw = test_df.groupby('sentence').word.apply(list).reset_index()['word']

y_train_raw = train_df.groupby('sentence').tag.apply(list).reset_index()['tag']
y_val_raw = val_df.groupby('sentence').tag.apply(list).reset_index()['tag']
y_test_raw = test_df.groupby('sentence').tag.apply(list).reset_index()['tag']

In [12]:
# Plot tag distributions per sentence
plot_tag_distribution(y_train_raw,'Training')
plot_tag_distribution(y_val_raw,'Validation')
plot_tag_distribution(y_test_raw,'Test')

As expected, looking at the tag distribution per sentence plots, the minority classes in the dataset have occurences only in some sentences in all the three datasets.

##-Vocabulary part-

GloVe Vocabulary (V1)

In [13]:
#Use the 300 dimensional GLove Word Embeddings
glove_dir = './'
embedding_dim = 300
embedding_dict = {} #initialize dictionary
f = open(os.path.join(glove_dir, f'glove.6B.{embedding_dim}d.txt'), encoding="utf8")
lines = f.readlines()
f.close()

pbar = progressbar.ProgressBar()
for line in pbar(lines):
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embedding_dict[word] = coefs

print('Found %s word vectors.' % len(embedding_dict))

# Computing mean embeddings per tag
tag_dict = mean_embed4tag(train_df, tags_train, embedding_dict, embedding_dim)

counts = []

100% (400000 of 400000) |################| Elapsed Time: 0:00:23 Time:  0:00:23


Found 400000 word vectors.
Computed mean embeddings for 45 tags.


V1 + Training set OOV (V2)

In [14]:
# Computing the embeddings for the OOV words found in training set
embedding_dict, counts_1 = update_vocab(train_df,embedding_dict,tag_dict,embedding_dim)
counts.append(counts_1)

Added 363 OOV words + respective embeddings to the vocabulary.
Added 1983 Capitalized words + respective embeddings to the vocabulary.


V2 + Validation set OOV (V3)

In [15]:
# Computing the embeddings for the OOV words found in validation set
embedding_dict, counts_2 = update_vocab(val_df,embedding_dict,tag_dict,embedding_dim)
counts.append(counts_2)

Added 190 OOV words + respective embeddings to the vocabulary.
Added 754 Capitalized words + respective embeddings to the vocabulary.


V3 + Test set OOV (V4)

In [16]:
# Computing the embeddings for the OOV words found in test set
embedding_dict, counts_3 = update_vocab(test_df,embedding_dict,tag_dict,embedding_dim)
counts.append(counts_3)

Added 129 OOV words + respective embeddings to the vocabulary.
Added 326 Capitalized words + respective embeddings to the vocabulary.


In [17]:
# Building the actual word vocabulary
index2word = OrderedDict()
word2index = OrderedDict()

# Adding the entry for padding
index2word[0] = '-PAD-'
word2index['-PAD-'] = 0

curr_idx = 1
for key in embedding_dict.keys():
  word2index[key] = curr_idx
  index2word[curr_idx] = key
  curr_idx += 1

vocab_length = len(word2index) 
print(f'[Debug] Index -> Word vocabulary size: {len(index2word)}')
print(f'[Debug] Word -> Index vocabulary size: {len(word2index)}')


counts_sum = [sum(x) for x in zip(*counts)]
counts_sum[2] = vocab_length - counts_sum[0] - counts_sum[1] -1
fig = px.pie(values=counts_sum, names=['OOV words','Cap. words','IV words'])
fig.show()

[Debug] Index -> Word vocabulary size: 403746
[Debug] Word -> Index vocabulary size: 403746


In [18]:
# Tag vocabulary

tag2index = OrderedDict()
index2tag = OrderedDict()

# Adding the entry for padding
index2tag[0] = '-PAD-'
tag2index['-PAD-'] = 0

curr_id = 1
for tag in tags_train:
  tag2index[tag] = curr_id
  index2tag[curr_id] = tag
  curr_id += 1

print(f'[Debug] Index -> Tag vocabulary size: {len(index2tag)}')
print(f'[Debug] Tag -> Index vocabulary size: {len(tag2index)}')


[Debug] Index -> Tag vocabulary size: 46
[Debug] Tag -> Index vocabulary size: 46


In [19]:
# Tokenising words and tags by their indexes in vocabulary
X_train_np, y_train_np = encode_sentences(X_train_raw,y_train_raw,word2index,tag2index)
X_val_np, y_val_np = encode_sentences(X_val_raw,y_val_raw,word2index,tag2index) 
X_test_np, y_test_np = encode_sentences(X_test_raw,y_test_raw,word2index,tag2index) 

# Examples
print('-Not encoded')
print('\t',X_train_raw[0]) 
print('\t',y_train_raw[0])
print('-Encoded')
print('\t',X_train_np[0])
print('\t',y_train_np[0])

-Not encoded
	 ['Pierre', 'Vinken', ',', '61', 'years', 'old', ',', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'Nov.', '29', '.']
	 ['NNP', 'NNP', ',', 'CD', 'NNS', 'JJ', ',', 'MD', 'VB', 'DT', 'NN', 'IN', 'DT', 'JJ', 'NN', 'NNP', 'CD', '.']
-Encoded
	 [400001, 400002, 2, 4979, 83, 168, 2, 44, 1430, 1, 535, 20, 8, 128565, 370, 400003, 1264, 3]
	 [21, 21, 4, 10, 23, 15, 4, 19, 35, 11, 20, 14, 11, 15, 20, 21, 10, 7]


In [20]:
# Checking the lengths of the sentences
lengths = [len(sentence) for sentence in X_train_raw]
lengths.sort()

# Showing a boxplot of the lengths of the sentences
fig = px.box(lengths)
fig.update_layout(xaxis_title='',
                  yaxis_title='',
                  title='Words per sentence')
fig.show()

In [21]:
# Maximum words in a sentence
MAX_LENGTH = lengths[-1] 
# Second longest sentence 
PAD_LENGTH = lengths[-2] 

print(f'Length of longest sentence: {MAX_LENGTH}')
print(f'Second longest sentence length: {PAD_LENGTH}')

# Padding the sequences
X_train = pad_sequences(X_train_np, maxlen=PAD_LENGTH, padding='post')
X_val = pad_sequences(X_val_np, maxlen=PAD_LENGTH, padding='post')
X_test = pad_sequences(X_test_np, maxlen=PAD_LENGTH, padding='post')

y_train = pad_sequences(y_train_np, maxlen=PAD_LENGTH, padding='post')
y_val = pad_sequences(y_val_np, maxlen=PAD_LENGTH, padding='post')
y_test = pad_sequences(y_test_np, maxlen=PAD_LENGTH, padding='post')

print('-Padded')
print('\tX:',X_train[0])
print('\n\ty:',y_train[0])

Length of longest sentence: 249
Second longest sentence length: 114
-Padded
	X: [400001 400002      2   4979     83    168      2     44   1430      1
    535     20      8 128565    370 400003   1264      3      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0]

	y: [21 21  4 10 23 15  4 19 35 11 20 14 11 15 20 21 10  7  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0 

In [22]:
# One hot encoding the sets
y_train_one_hot = to_categorical(y_train, len(tag2index))
y_val_one_hot = to_categorical(y_val, len(tag2index))
y_test_one_hot = to_categorical(y_test, len(tag2index))

## 2. GloVe 
GloVe (Global Vectors for Word Representation) is a method for learning vector representations of words, called "word embeddings," from a large corpus of text. Word embeddings are numerical representations of words that capture the semantic relationships between words in a continuous, low-dimensional space. They are commonly used as input to natural language processing models, such as language translation and language modeling.

GloVe works by learning the co-occurrence statistics of words in a corpus, and using this information to learn word embeddings that capture the semantic relationships between words. The GloVe method produces word embeddings that are trained on a global corpus, as opposed to embeddings that are trained on a specific task or dataset.

There are different versions of the GloVe word embeddings, including 50-dimensional, 100-dimensional, and 200-dimensional embeddings. The 50-dimensional version of GloVe embeddings may be better in some applications because they have a lower dimensionality, which can make them easier to work with and more computationally efficient.

By using GloVe embeddings as the initial weights for a model, we can take advantage of these pre-trained word representations and fine-tune them for a specific task.

In [23]:
#Building the Embedding Layer 
embedding_matrix = np.zeros((len(word2index), embedding_dim))
for word, i in word2index.items():
  if word != '-PAD-':
    embedding_vector = embedding_dict.get(word)
    embedding_matrix[i] = embedding_vector

In [24]:
if 'embedding_dict' in globals():
    del embedding_dict
    print(f'Garbage Collection: {gc.collect()}')

Garbage Collection: 6994


## 3. Model
### 3.1 Baseline 
Bidirectional LSTM layers are able to process sequential data in both the forward and backward directions, which can allow the model to capture contextual information from both the past and the future. This can be particularly useful for natural language processing tasks, where the meaning of a word can depend on the context in which it is used.

In the context of POS tagging, TimeDistributed can be used to apply a tag prediction layer to each word in a sentence. For example, you might have an RNN that processes a sequence of words in a sentence, and at each time step, the RNN outputs a hidden state. You could then apply a TimeDistributed dense layer to the hidden states, which would allow you to predict the POS tag for each word in the sentence.

One advantage of using TimeDistributed for POS tagging is that it allows you to predict the POS tag for each word in the sentence simultaneously, rather than having to process the sentence one word at a time. This can be particularly useful when dealing with long sentences, as it can make the tagging process more efficient.

Overall, using TimeDistributed for POS tagging can help you build more accurate and efficient models for natural language processing tasks that involve sequential data.

In [25]:
# Tags to ignore from the metrics
ignore = [':', '#', '$', '-LRB-', '-RRB-', ',', '.', "''", '``', 'SYM','LS','-PAD-']

# Computing the weights for the classes
weights = compute_weights(train_df,tag2index)

# Custom metrics ignoring classes
w_accuracy = ignore_class_accuracy(weights,[tag2index[tag] for tag in ignore])
metrics = [w_accuracy]

# Embedding layer parameters
embedding_params = {'input_dim': vocab_length,'output_dim': embedding_dim,
                    'weights': [embedding_matrix],'input_length': PAD_LENGTH}
# Callbacks
callbacks = [
    
    tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True,verbose=True),
    tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.33, patience=2, verbose=True, min_lr=0.001)
]

# Seeds
seeds = [3,42,192]

Weights: 
[0.99, 0.9999788833516344, 0.9927781062589746, 0.991574457302137, 0.9457090970521159, 0.9989019342849903, 0.9988385843398936, 0.9586324858518456, 0.9938128220288875, 0.975905904214883, 0.9696131430019427, 0.9139285412619309, 0.998965284230087, 0.9999577667032689, 0.8954303572936904, 0.9368189880902104, 0.9966846862066053, 0.9980361517020019, 0.9997888335163443, 0.991278824225019, 0.8675986147478673, 0.8901511952022975, 0.9979939184052707, 0.9365655883098235, 0.9998099501647099, 0.9914899907086747, 0.9798547174592449, 0.9913632908184813, 0.9685361939352986, 0.9981839682405609, 0.9995987836810541, 0.99704366922882, 0.9999788833516344, 0.9782920854801925, 0.9999788833516344, 0.9747656052031421, 0.9673114283300954, 0.983887997297069, 0.9782287355350958, 0.9846481966382296, 0.9760748374018076, 0.9956922037334235, 0.9970225525804545, 0.9998733001098066, 0.9980572683503675, 0.9913632908184813]


In [26]:
# Learning Rate
LR = 0.03

baseline_layer_params = [{'layer_type': 'Bidirectional',
                          'layer_kwargs': {'units': 128}},
                         {'layer_type': 'Dense',
                          'layer_kwargs':{'units': len(tag2index),'activation': 'softmax'}}]

bl_training_params = {'x': X_train, 'y': y_train_one_hot, 'validation_data': (X_val, y_val_one_hot),
                  'batch_size': 32, 'epochs': 15, 'callbacks': callbacks} 


baseline_model_recaps = run_models('Baseline',baseline_layer_params,embedding_params,bl_training_params,metrics,LR,seeds)


Running with seed: 3
Model: "Baseline_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 114, 300)          121123800 
                                                                 
 bidirectional (Bidirectiona  (None, 114, 256)         439296    
 l)                                                              
                                                                 
 time_distributed (TimeDistr  (None, 114, 46)          11822     
 ibuted)                                                         
                                                                 
Total params: 121,574,918
Trainable params: 451,118
Non-trainable params: 121,123,800
_________________________________________________________________

Fitting the Baseline (seed 3) model...
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 6: ReduceLROnPlateau reducing learn

In [27]:
mean_metrics_bl = mean_metrics(baseline_model_recaps)

Mean values: {'macro_f1': 0.703175334502025, 'weighted_ignore_accuracy': 0.8508925040562948}


Different combinations of hyperparameters have been taken in consideration during the tuning phase.

The most evident changes in the scores were due to the `units` used in the LSTM layer and the `batch_size`:

*  Generally, a larger number of units in the LSTM layer means that the model has more capacity to learn complex representations. However, having too many units can lead to overfitting, where the model memorizes the training data instead of learning general patterns. On the other hand, having too few units can lead to underfitting, where the model is not able to capture important features of the data.
In our case, using fewer units (from 256 to 128) may be helping to prevent the model from memorizing the training data, resulting in a more generalizable model that is better able to generalize to new data. 

*   The reason for seeing better results with a batch size of `32` compared to bigger sizes, could be due to a phenomenon called "*batch normalization instability*". Larger batch sizes can lead to a higher variance in the estimated mean and variance used in batch normalization, making the normalization less stable and leading to worse results. Smaller batch sizes, on the other hand, can provide a more stable estimate of the mean and variance and lead to improved results.
Using an unbalanced dataset like ours can lead to a bias towards the class with more samples. When the batch size is small, the model is more likely to see a diverse range of samples in each batch, which can help mitigate the impact of the class imbalance. On the other hand, if the batch size is too large, the model may not see enough samples from the minority class to learn to accurately classify them. In such cases, using a smaller batch size could help alleviate the problem and improve performance.





### 3.2 GRU 
Gated Recurrent Units (GRUs) are a type of recurrent neural network (RNN) that are often used in natural language processing tasks such as part-of-speech (POS) tagging. GRUs are similar to long short-term memory (LSTM) networks, but they have a simpler structure and fewer parameters, making them easier to train and faster to run. In POS tagging, GRUs can be used to process a sequence of words and predict the POS tags for each word in the sequence. GRUs are able to take into account contextual information from the previous words in the sequence, allowing them to make more accurate predictions about the POS tags for the current word. 

................

In [28]:
# Learning Rate
LR = 0.03

gru_layer_params = [{'layer_type': 'GRU',
                     'layer_kwargs': {'units': 128}},
                    {'layer_type': 'Dense',
                     'layer_kwargs':{'units': len(tag2index),'activation': 'softmax'}}]

gru_training_params = {'x': X_train, 'y': y_train_one_hot, 'validation_data': (X_val, y_val_one_hot),
                  'batch_size': 32, 'epochs': 15, 'callbacks': callbacks} 

gru_model_recaps = run_models('GRU',gru_layer_params,embedding_params,gru_training_params,metrics,LR,seeds)

Running with seed: 3
Model: "GRU_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 114, 300)          121123800 
                                                                 
 gru (GRU)                   (None, 114, 128)          165120    
                                                                 
 time_distributed (TimeDistr  (None, 114, 46)          5934      
 ibuted)                                                         
                                                                 
Total params: 121,294,854
Trainable params: 171,054
Non-trainable params: 121,123,800
_________________________________________________________________

Fitting the GRU (seed 3) model...
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 5: ReduceLROnPlateau reducing learning rate to 0.009899999778717757.
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 9: 

In [29]:
mean_metrics_gru = mean_metrics(gru_model_recaps)

Mean values: {'macro_f1': 0.6389822757717348, 'weighted_ignore_accuracy': 0.8349865873654684}


As it's noticeable by the scores obtained, the simple LSTM architecture used as baseline perform better than this (equally simple) one.
 
LSTM has a more complex structure compared to GRU, which allows it to model more complex dependencies in the data. This makes LSTM more suitable for tasks like POS-tagging where the relationships between words can be very complex.

The results may also be affected by the suitability of the data for the GRU architecture. If the data contains long-term dependencies that are better captured by LSTM, then the GRU architecture may not perform as well.

### 3.3 Additional LSTM layer 
The second biLSTM layer increases the capacity of the network, allowing it to model more complex dependencies in the data. This can lead to better performance on tasks like POS-tagging where the relationships between words can be very complex. Moreover, the second biLSTM layer can also help improve representation learning by capturing higher-level abstractions of the input data. This can lead to better generalization and improved performance on the target task.

The biLSTM architecture is known to be good at handling long-term dependencies, and adding a second biLSTM layer can help further improve this ability. This can be especially beneficial for tasks like POS-tagging where the relationships between words can span multiple time steps.

However, it's also possible that adding a second biLSTM layer could lead to overfitting, especially if the model is already sufficiently large to model the data. 

In [30]:
LR = 0.01

add_lstm_layer_params = [{'layer_type': 'Bidirectional',
                          'layer_kwargs': {'units': 128, 'dropout': 0.2}},
                         {'layer_type': 'Bidirectional',
                          'layer_kwargs': {'units': 128, 'recurrent_dropout': 0.2}},
                         {'layer_type': 'Dense',
                          'layer_kwargs':{'units': len(tag2index),'activation': 'softmax'}}]

add_lstm_training_params = {'x': X_train, 'y': y_train_one_hot, 'validation_data': (X_val, y_val_one_hot),
                  'batch_size': 32, 'epochs': 20, 'callbacks': callbacks} 

add_lstm_model_recaps = run_models('Additional_LSTM',add_lstm_layer_params,embedding_params,add_lstm_training_params,metrics,LR,seeds)

Running with seed: 3




Model: "Additional_LSTM_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 114, 300)          121123800 
                                                                 
 bidirectional (Bidirectiona  (None, 114, 256)         439296    
 l)                                                              
                                                                 
 bidirectional_1 (Bidirectio  (None, 114, 256)         394240    
 nal)                                                            
                                                                 
 time_distributed (TimeDistr  (None, 114, 46)          11822     
 ibuted)                                                         
                                                                 
Total params: 121,969,158
Trainable params: 845,358
Non-trainable params: 121,123,800
_____________________________



Model: "Additional_LSTM_42"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 114, 300)          121123800 
                                                                 
 bidirectional (Bidirectiona  (None, 114, 256)         439296    
 l)                                                              
                                                                 
 bidirectional_1 (Bidirectio  (None, 114, 256)         394240    
 nal)                                                            
                                                                 
 time_distributed (TimeDistr  (None, 114, 46)          11822     
 ibuted)                                                         
                                                                 
Total params: 121,969,158
Trainable params: 845,358
Non-trainable params: 121,123,800
____________________________



Model: "Additional_LSTM_192"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 114, 300)          121123800 
                                                                 
 bidirectional (Bidirectiona  (None, 114, 256)         439296    
 l)                                                              
                                                                 
 bidirectional_1 (Bidirectio  (None, 114, 256)         394240    
 nal)                                                            
                                                                 
 time_distributed (TimeDistr  (None, 114, 46)          11822     
 ibuted)                                                         
                                                                 
Total params: 121,969,158
Trainable params: 845,358
Non-trainable params: 121,123,800
___________________________

In [31]:
mean_metrics_add_lstm = mean_metrics(add_lstm_model_recaps)

Mean values: {'macro_f1': 0.7415492252982498, 'weighted_ignore_accuracy': 0.8608795801798502}


Due to the constraints to the layers in our architecture, a Dropout layer was not applicable. In that case, the dropout is applied after the current layer's output, i.e., the dropout is applied to the activations of the LSTM layer. This means that the dropout rate you specify for the Dropout layer will apply to the output of the LSTM layer, and some of the activations will be set to zero during each forward pass with a probability specified by the dropout rate.

Instead, when using the `dropout` parameter in the LSTM layer, the dropout is applied to the input of the LSTM layer. This means that the dropout rate specified for the `dropout` parameter will apply to the input connections, and some of the connections will be set to zero during each forward pass with a probability specified by the dropout rate.

Similarly works the `recurrent_dropout` parameter, but it applies dropout to the recurrent connections of the LSTM layer, rather than the input connections.

Using both the parameters in an LSTM layer can be effective in preventing overfitting and improving the generalization of the network, but they can slow down the convergence of the network, making it more difficult to train and can lead to an increase in the variance of the gradients, which can make the training process more unstable.

In our case, by setting low values (0.1) to both `dropout` and `recurrent_dropout` worked the best in terms of macro f1 scores obtained.

### 3.4 Additional dense layer

Using two dense layers, one with a non-linear activation function and one with a softmax activation function, is a common pattern in neural network architectures for classification tasks.

The purpose of the non-linear dense layer is to introduce non-linearity into the model, which can allow the model to learn more complex patterns in the data. Common choices for the activation function in this layer include ReLU (Rectified Linear Unit) or (sigmoid/tanh) as activation functions.

The purpose of the softmax dense layer is to produce a probability distribution over the possible classes. The softmax activation function transforms the output of the preceding layer into a probability distribution, where the sum of the probabilities is equal to 1. This is useful for classification tasks, where you want to predict the probability that an input belongs to each of the possible classes. Using two dense layers in this way can allow the model to learn more complex patterns in the data and make more accurate predictions.

In [32]:
LR = 0.01

add_fc_layer_params = [{'layer_type': 'Bidirectional',
                          'layer_kwargs': {'units': 128, 'dropout': 0.2}},
                         {'layer_type': 'Dense',
                          'layer_kwargs':{'units': PAD_LENGTH,'activation': 'relu'}},
                         {'layer_type': 'Dense',
                          'layer_kwargs':{'units': len(tag2index),'activation': 'softmax'}}]

add_fc_training_params = {'x': X_train, 'y': y_train_one_hot, 'validation_data': (X_val, y_val_one_hot),
                          'batch_size': 32, 'epochs': 25, 'callbacks': callbacks} 

add_fc_model_recaps = run_models('Additional_FC',add_fc_layer_params,embedding_params,add_fc_training_params,metrics,LR,seeds)

Running with seed: 3
Model: "Additional_FC_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 114, 300)          121123800 
                                                                 
 bidirectional (Bidirectiona  (None, 114, 256)         439296    
 l)                                                              
                                                                 
 time_distributed (TimeDistr  (None, 114, 114)         29298     
 ibuted)                                                         
                                                                 
 time_distributed_1 (TimeDis  (None, 114, 46)          5290      
 tributed)                                                       
                                                                 
Total params: 121,597,684
Trainable params: 473,884
Non-trainable params: 121,123,800
__________

In [33]:
mean_metrics_add_fc = mean_metrics(add_fc_model_recaps)

Mean values: {'macro_f1': 0.7002543599985797, 'weighted_ignore_accuracy': 0.8567142685254415}


## 4. Comparisons


### Val Weighted Ignore Accuracy during training 

In [51]:
plot_train(baseline_model_recaps,gru_model_recaps,add_lstm_model_recaps,add_fc_model_recaps,metric='val_weighted_ignore_accuracy')

### Average Macro F1 score

In [52]:
# Sample data
data = [mean_metrics_bl['macro_f1'], mean_metrics_gru['macro_f1'],
        mean_metrics_add_lstm['macro_f1'], mean_metrics_add_fc['macro_f1']]

# Create figure
fig = go.Figure(data=[go.Bar(x=['Baseline','GRU', 'Add. LSTM', 'Add. FC'], y=data)])

# Set titles for axes
fig.update_layout(xaxis_title='Model type', yaxis_title='Mean Macro F1')

# Show plot
fig.show()


### Macro F1 scores and predictions on test set

#### Additional LSTM layer model

In [53]:
add_lstm_test, predictions_add_lstm = test_f(add_lstm_model_recaps,X_test,X_test_np,y_test,y_test_np,tag2index,index2tag,index2word,ignore)

Evaluating model 1 with the Test set:
Macro f1 score for model 1: 0.8345913464793259

Evaluating model 2 with the Test set:
Macro f1 score for model 2: 0.7446747252841209

Evaluating model 3 with the Test set:
Macro f1 score for model 3: 0.7808048259749281

Mean macro f1 score on test set: 0.7866902992461249

Test sentence: ['Any', 'question', 'as', 'to', 'why', 'an', 'author', 'would', 'believe', 'this', 'plaintive', ',', 'high-minded', 'note', 'of', 'assurance', 'is', 'necessary', 'is', 'answered', 'by', 'reading', 'this', 'book', 'about', 'sticky', 'fingers', 'and', 'sweaty', 'scammers', '.']
Ground truth tags: ['DT', 'NN', 'IN', 'TO', 'WRB', 'DT', 'NN', 'MD', 'VB', 'DT', 'JJ', ',', 'JJ', 'NN', 'IN', 'NN', 'VBZ', 'JJ', 'VBZ', 'VBN', 'IN', 'VBG', 'DT', 'NN', 'IN', 'JJ', 'NNS', 'CC', 'JJ', 'NNS', '.']
Predictions: ['DT', 'NN', 'IN', 'TO', 'VB', 'DT', 'NN', 'MD', 'VB', 'DT', 'JJ', ',', 'JJ', 'NN', 'IN', 'NN', 'VBZ', 'JJ', 'VBZ', 'VBN', 'IN', 'VBG', 'DT', 'NN', 'IN', 'JJ', 'NNS', 'CC', 

#### Additional Dense layer model

In [54]:
add_fc_test, predictions_add_fc = test_f(add_fc_model_recaps,X_test,X_test_np,y_test,y_test_np,tag2index,index2tag,index2word,ignore)

Evaluating model 1 with the Test set:
Macro f1 score for model 1: 0.7549462065923791

Evaluating model 2 with the Test set:
Macro f1 score for model 2: 0.7824158499053713

Evaluating model 3 with the Test set:
Macro f1 score for model 3: 0.7553212077931631

Mean macro f1 score on test set: 0.7642277547636378

Test sentence: ['West', 'Texas', 'Intermediate', 'for', 'December', 'delivery', 'advanced', '22', 'cents', 'to', '$', '19.94', 'a', 'barrel', '.']
Ground truth tags: ['NNP', 'NNP', 'NNP', 'IN', 'NNP', 'NN', 'VBD', 'CD', 'NNS', 'TO', '$', 'CD', 'DT', 'NN', '.']
Predictions: ['NNP', 'NNP', 'NNP', 'IN', 'NNP', 'NN', 'NNP', 'CD', 'NNS', 'TO', '$', 'CD', 'DT', 'NN', '.']


###Test the model with a keyboard-input sentence

In [56]:
tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='', split=' ', lower=False, char_level=False, oov_token=False)
tokenizer.word_index = word2index

sentence = input("Please enter a sentence: ")
encoded_sentence = tokenizer.texts_to_sequences([sentence])
padded_sentence = pad_sequences(sequences=encoded_sentence,dtype='float32',maxlen=PAD_LENGTH, padding='post')
prediction = add_fc_model_recaps[0]["model"].predict(padded_sentence)
prediction = np.argmax(prediction, axis=-1)

print(f'Sentence: {[index2word[word] for word in encoded_sentence[0]]}')
print(f'Predicted pos tags: {[index2tag[tag] for tag in prediction[0] if index2tag[tag] != "-PAD-"]}') 



Please enter a sentence: I am happy
Sentence: ['I', 'am', 'happy']
Predicted pos tags: ['PRP', 'JJ', 'JJ']
