# Assignment 1

**Due to**: 11/01/2022 (dd/mm/yyyy)

If you deliver it by 11/12/2021 your assignment will be graded by 11/01/2022.


**Credits**: Andrea Galassi, Federico Ruggeri, Paolo Torroni

**Summary**: Part-of Speech (POS) tagging as Sequence Labelling using Recurrent Neural Architectures

# Execution
https://www.kaggle.com/code/tanyadayanand/pos-tagging-using-rnn

A bunch of libraries and functions that will be used throughout the notebook.


In [1]:
import re
import pandas as pd
import numpy as np
import os
import urllib.request
import zipfile
import progressbar

import nltk
import sklearn
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical

import tensorflow as tf
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, GRU, TimeDistributed
from sklearn.metrics import f1_score

nltk.download('treebank')
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [2]:
def pre_process(df,string):
  def text_pre_process(text):
      ret = re.sub("RT @(.)+?:\s|(&#[0-9]+;)|@([\w\-]+)|(#)\S+|(http)s?\S+|&gt;|^\s+|\b\s+|\n", "", text)
      ret = re.sub("\s\s+|[^a-zA-Z\d\s:]" , " ", ret).rstrip().lower()
      return ret
  return df[string].apply(text_pre_process)

def create_embedding_matrix(filepath, word_index, embedding_dim):
  vocab_size = len(word_index)+1
  embedding_matrix = np.zeros((vocab_size,embedding_dim))

  with open(filepath, encoding='utf-8') as f:
    for line in f:
      word, *vector = line.split()
      if word in word_index:
        idx = word_index[word]
        embedding_matrix[idx] = np.array(vector, dtype=np.float32)[:embedding_dim]
  return embedding_matrix

## 1. Corpus
### 1.1 Pre-processing

From the original tags list we removed all the symbols and english punctuation plus:
- FW, Foreign Word, because there are no examples in the test set;
- UH, Interjection, because there are no examples in the test set;
- LS, List Item Marker, because there are no examples in the test set (and because it denotes symbols as well);

In [3]:
# Get the files' list
fileids = nltk.corpus.treebank.fileids()

# Get the Penn Treebank corpus and tokenize the text
train_corpus = nltk.corpus.treebank.tagged_sents(fileids[:100])
val_corpus = nltk.corpus.treebank.tagged_sents(fileids[100:150])
test_corpus = nltk.corpus.treebank.tagged_sents(fileids[150:])

# Flatten the lists
remove = [':', '#', '"', '$', '-LRB-', '-RRB-', ',', '.', "''", '``', 'SYM', '-NONE-']

train_corpus = [tuple(list(item)+[str(idx)]) for idx,sublist in enumerate(train_corpus) for item in sublist if item[1] not in remove]
val_corpus = [tuple(list(item)+[str(idx)]) for idx,sublist in enumerate(val_corpus) for item in sublist if item[1] not in remove]
test_corpus = [tuple(list(item)+[str(idx)]) for idx,sublist in enumerate(test_corpus) for item in sublist if item[1] not in remove]

In [4]:
train_df = pd.DataFrame(train_corpus, columns = ['word', 'tag', 'sentence'])
# train_df['word'] = pre_process(train_df,'word')

print(train_df.shape)
train_df.describe()

(41274, 3)


Unnamed: 0,word,tag,sentence
count,41274,41274,41274
unique,7989,35,1963
top,the,NN,1854
freq,1981,6270,171


In [5]:
val_df = pd.DataFrame(val_corpus, columns = ['word', 'tag', 'sentence'])
# val_df['word'] = pre_process(val_df,'word')

print(val_df.shape)
val_df.describe()

(27418, 3)


Unnamed: 0,word,tag,sentence
count,27418,27418,27418
unique,5873,35,1299
top,the,NN,339
freq,1429,4513,75


In [6]:
test_df = pd.DataFrame(test_corpus, columns = ['word', 'tag', 'sentence'])
# test_df['word'] = pre_process(test_df,'word')

print(test_df.shape)
test_df.describe()

(13676, 3)


Unnamed: 0,word,tag,sentence
count,13676,13676,13676
unique,3608,32,652
top,the,NN,231
freq,635,2383,51


In [7]:
tags_train = sorted(list(set([x for x in train_df.tag])))
tags_val = sorted(list(set([x for x in val_df.tag])))
tags_test = sorted(list(set([x for x in test_df.tag])))

print('Train tags number:',len(tags_train))
print('Val tags number:',len(tags_val))
print('Test tags number:',len(tags_test))

if len(tags_test) != len(tags_val) or len(tags_test) != len(tags_train):
  print('\nMismatching numbers.')
  print('Removing extra classes:')

  missing_classes_train = [x for x in tags_train if x not in tags_test]
  missing_classes_val   = [x for x in tags_val if x not in tags_test]

  missing_classes = list(set(missing_classes_train + missing_classes_val))
  print(missing_classes)

  for cl in missing_classes:
    train_df = train_df[train_df.tag != cl]
    val_df = val_df[val_df.tag != cl]

  tags_train = sorted(list(set([x for x in train_df.tag])))
  tags_val = sorted(list(set([x for x in val_df.tag])))
  tags_test = sorted(list(set([x for x in test_df.tag])))

  print('\nNew Train tags number:',len(tags_train))
  print('New Val tags number:',len(tags_val))
  print('New Test tags number:',len(tags_test))

print('\nTags:')
for tag in tags_train:
  print(f'-{tag}')

Train tags number: 35
Val tags number: 35
Test tags number: 32

Mismatching numbers.
Removing extra classes:
['UH', 'FW', 'LS']

New Train tags number: 32
New Val tags number: 32
New Test tags number: 32

Tags:
-CC
-CD
-DT
-EX
-IN
-JJ
-JJR
-JJS
-MD
-NN
-NNP
-NNPS
-NNS
-PDT
-POS
-PRP
-PRP$
-RB
-RBR
-RBS
-RP
-TO
-VB
-VBD
-VBG
-VBN
-VBP
-VBZ
-WDT
-WP
-WP$
-WRB


In [8]:
X_train_raw = train_df.groupby('sentence').word.apply(list).reset_index()['word']
y_train_raw = train_df.groupby('sentence').tag.apply(list).reset_index()['tag']

X_val_raw = val_df.groupby('sentence').word.apply(list).reset_index()['word']
y_val_raw = val_df.groupby('sentence').tag.apply(list).reset_index()['tag']

X_test_raw = test_df.groupby('sentence').word.apply(list).reset_index()['word']
y_test_raw = test_df.groupby('sentence').tag.apply(list).reset_index()['tag']

X = [*X_train_raw, *X_val_raw, *X_test_raw]
y = [*y_train_raw, *y_val_raw, *y_test_raw]

In [9]:
# encode X

word_tokenizer = Tokenizer()
word_tokenizer.fit_on_texts(X)

num_words = 9000
word_tokenizer.word_index = {e:i for e,i in word_tokenizer.word_index.items() if i <= num_words}
word_tokenizer.word_index[word_tokenizer.oov_token] = num_words + 1

X_train = word_tokenizer.texts_to_sequences(X_train_raw)
X_val = word_tokenizer.texts_to_sequences(X_val_raw)
X_test = word_tokenizer.texts_to_sequences(X_test_raw)

vocab_size = len(word_tokenizer.word_index) + 1

In [10]:
# encode Y

tag_tokenizer = Tokenizer()
tag_tokenizer.fit_on_texts(y)

y_train = tag_tokenizer.texts_to_sequences(y_train_raw)
y_val = tag_tokenizer.texts_to_sequences(y_val_raw)
y_test = tag_tokenizer.texts_to_sequences(y_test_raw)

In [11]:
print('-Not encoded')
print('\t',X_train_raw[0]) 
print('\t',y_train_raw[0])
print('-Encoded')
print('\t',X_train[0])
print('\t',y_train[0])

-Not encoded
	 ['Pierre', 'Vinken', '61', 'years', 'old', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'Nov.', '29']
	 ['NNP', 'NNP', 'CD', 'NNS', 'JJ', 'MD', 'VB', 'DT', 'NN', 'IN', 'DT', 'JJ', 'NN', 'NNP', 'CD']
-Encoded
	 [5398, 3694, 1987, 70, 305, 34, 2366, 1, 112, 17, 4, 1988, 306, 433, 1989]
	 [3, 3, 7, 5, 6, 18, 10, 4, 1, 2, 4, 6, 1, 3, 7]


In [12]:
# check length of longest sentence
lengths = [len(seq) for seq in X_train+X_test+X_val]
print("Length of longest sentence: {}".format(max(lengths)))

max_len = max(lengths)
X_train = pad_sequences(X_train,padding='post',maxlen=max_len)
X_val = pad_sequences(X_val,padding='post',maxlen=max_len)
X_test = pad_sequences(X_test,padding='post',maxlen=max_len)

y_train = pad_sequences(y_train,padding='post',maxlen=max_len)
y_val = pad_sequences(y_val,padding='post',maxlen=max_len)
y_test = pad_sequences(y_test,padding='post',maxlen=max_len)

print('-Padded')
print('\tX:',X_train[0])
print('\n\ty:',y_train[0])

Length of longest sentence: 171
-Padded
	X: [5398 3694 1987   70  305   34 2366    1  112   17    4 1988  306  433
 1989    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0]

	y: [ 3  3  7  5  6 18 10  4  1  2  4  6  1  3  7  0  0  0  0  0  0  0  0  0
  0  0  0

In [13]:
y_train = to_categorical(y_train)
y_val = to_categorical(y_val)
y_test = to_categorical(y_test)

## 2. GloVe 
GloVe (Global Vectors for Word Representation) is a method for learning vector representations of words, called "word embeddings," from a large corpus of text. Word embeddings are numerical representations of words that capture the semantic relationships between words in a continuous, low-dimensional space. They are commonly used as input to natural language processing models, such as language translation and language modeling.

GloVe works by learning the co-occurrence statistics of words in a corpus, and using this information to learn word embeddings that capture the semantic relationships between words. The GloVe method produces word embeddings that are trained on a global corpus, as opposed to embeddings that are trained on a specific task or dataset.

There are different versions of the GloVe word embeddings, including 50-dimensional, 100-dimensional, and 200-dimensional embeddings. The 50-dimensional version of GloVe embeddings may be better in some applications because they have a lower dimensionality, which can make them easier to work with and more computationally efficient.

In [14]:
pbar = None
def show_progress(block_num, block_size, total_size):
    global pbar
    if pbar is None:
        pbar = progressbar.ProgressBar(maxval=total_size)
        pbar.start()

    downloaded = block_num * block_size
    if downloaded < total_size:
        pbar.update(downloaded)
    else:
        pbar.finish()
        pbar = None

# Download the GloVe embeddings file
url = 'http://nlp.stanford.edu/data/glove.6B.zip'
urllib.request.urlretrieve(url, 'glove.6B.zip', show_progress)

# Extract the zip file
zip_ref = zipfile.ZipFile('glove.6B.zip', 'r')
zip_ref.extractall()
zip_ref.close()

100% (862182613 of 862182613) |##########| Elapsed Time: 0:02:38 Time:  0:02:38


In [15]:
# Load the GloVe embeddings into a dictionary
embedding_dict = {}
embedding_dim = 300
with open(f'glove.6B.{embedding_dim}d.txt', 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embedding_dict[word] = coefs

# Print the number of words in the embeddings dictionary
print(f'Found {len(embedding_dict)} word vectors.')

Found 400000 word vectors.


In [16]:
def find_closest_embeddings(embedding):
    return sorted(embedding_dict.keys(), key=lambda word: np.linalg.norm(embedding_dict[word]- embedding))[:5]

find_closest_embeddings(embedding_dict['iphone'])

['iphone', 'ipad', 'ipod', 'iphones', 'app']

In [17]:
def create_embedding_matrix(filepath, word_index, embedding_dim):
  vocab_size = len(word_index)+1
  embedding_matrix = np.zeros((vocab_size,embedding_dim))

  with open(filepath, encoding='utf-8') as f:
    for line in f:
      word, *vector = line.split()
      if word in word_index:
        idx = word_index[word]
        embedding_matrix[idx] = np.array(vector, dtype=np.float32)[:embedding_dim]
  return embedding_matrix

input_dim = X_train.shape[1]
embedding_matrix = create_embedding_matrix(f'glove.6B.{embedding_dim}d.txt', word_tokenizer.word_index, embedding_dim)

## 3. Model
### 3.1 Baseline (MACRO f1 0.84)
Bidirectional LSTM layers are able to process sequential data in both the forward and backward directions, which can allow the model to capture contextual information from both the past and the future. This can be particularly useful for natural language processing tasks, where the meaning of a word can depend on the context in which it is used.

In the context of POS tagging, TimeDistributed can be used to apply a tag prediction layer to each word in a sentence. For example, you might have an RNN that processes a sequence of words in a sentence, and at each time step, the RNN outputs a hidden state. You could then apply a TimeDistributed dense layer to the hidden states, which would allow you to predict the POS tag for each word in the sentence.

One advantage of using TimeDistributed for POS tagging is that it allows you to predict the POS tag for each word in the sentence simultaneously, rather than having to process the sentence one word at a time. This can be particularly useful when dealing with long sentences, as it can make the tagging process more efficient.

Overall, using TimeDistributed for POS tagging can help you build more accurate and efficient models for natural language processing tasks that involve sequential data.

In [18]:
# Define the model
baseline_model = tf.keras.Sequential(name='Baseline')

# Add the Embedding layer
baseline_model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, \
                    weights = [embedding_matrix], input_length = max_len, trainable=True))

# Add the Bidirectional LSTM layer
baseline_model.add(Bidirectional(LSTM(units=64, return_sequences=True)))

# Add the Dense/Fully-Connected layer
baseline_model.add(TimeDistributed(Dense(units=len(tags_train)+1, activation='softmax')))

# Compile the model
baseline_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Summary
baseline_model.summary()

Model: "Baseline"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 171, 300)          2700600   
                                                                 
 bidirectional (Bidirectiona  (None, 171, 128)         186880    
 l)                                                              
                                                                 
 time_distributed (TimeDistr  (None, 171, 33)          4257      
 ibuted)                                                         
                                                                 
Total params: 2,891,737
Trainable params: 2,891,737
Non-trainable params: 0
_________________________________________________________________


In [19]:
results = baseline_model.fit(X_train, y_train, epochs=10, verbose = True, validation_data=(X_val,y_val), batch_size=128)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [20]:
y_pred = baseline_model.predict(X_test)



In [21]:
# Flatten the predictions and the true labels to 1D arrays
predictions = y_pred.flatten()
y_true = y_test.flatten()

th = 0.1
predictions[predictions >= th] = 1 
predictions[predictions  < th] = 0

# Compute the F1 score
f1 = f1_score(y_true, predictions, average = 'macro')

print("Macro-F1 score:", round(f1,3))

Macro-F1 score: 0.953


### 3.2 GRU 
Unica che non funzia dio po

In [22]:
# Define the model
gru_model = tf.keras.Sequential(name='GRU')

# Add the Embedding layer
gru_model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, \
                    weights = [embedding_matrix], input_length = max_len, trainable=True))

# Add the GRU layer
gru_model.add(GRU(units=128, return_sequences=True))

# Add the Dense/Fully-Connected layer
gru_model.add(TimeDistributed(Dense(len(tags_train)+1, activation='softmax')))

# Compile the model
gru_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Summary
gru_model.summary()

Model: "GRU"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 171, 300)          2700600   
                                                                 
 gru (GRU)                   (None, 171, 128)          165120    
                                                                 
 time_distributed_1 (TimeDis  (None, 171, 33)          4257      
 tributed)                                                       
                                                                 
Total params: 2,869,977
Trainable params: 2,869,977
Non-trainable params: 0
_________________________________________________________________


In [23]:
results = gru_model.fit(X_train, y_train, epochs=10, verbose = True, validation_data=(X_val,y_val), batch_size=128)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [24]:
y_pred = gru_model.predict(X_test)



In [25]:
# Flatten the predictions and the true labels to 1D arrays
predictions = y_pred.flatten()
y_true = y_test.flatten()

th = 0.1
predictions[predictions >= th] = 1 
predictions[predictions  < th] = 0

# Compute the F1 score
f1 = f1_score(y_true, predictions, average = 'macro')

print("Macro-F1 score:", round(f1,3))

Macro-F1 score: 0.959


### 3.3 Additional LSTM layer (MACRO f1 0.82) 
Using two bidirectional LSTM layers can allow the model to learn more complex patterns in the data and make more accurate predictions. 
However, they can increase the computational complexity of our model, which may require more computational resources to train.

Indeed, here the train was slower and the results similar to the baseline architecture.

In [26]:
# Define the model
add_lstm_model = tf.keras.Sequential(name='Additional_LSTM')

# Add the Embedding layer
add_lstm_model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, \
                    weights = [embedding_matrix], input_length = max_len, trainable=True))

# Add the Bidirectional LSTM layer
add_lstm_model.add(Bidirectional(LSTM(units=128, return_sequences=True)))

# Add another LSTM layer
add_lstm_model.add(Bidirectional(LSTM(units=128, return_sequences=True)))

# Add the Dense/Fully-Connected layer
add_lstm_model.add(TimeDistributed(Dense(units=len(tags_train)+1, activation='softmax')))

# Compile the model
add_lstm_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Summary
add_lstm_model.summary()

Model: "Additional_LSTM"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 171, 300)          2700600   
                                                                 
 bidirectional_1 (Bidirectio  (None, 171, 256)         439296    
 nal)                                                            
                                                                 
 bidirectional_2 (Bidirectio  (None, 171, 256)         394240    
 nal)                                                            
                                                                 
 time_distributed_2 (TimeDis  (None, 171, 33)          8481      
 tributed)                                                       
                                                                 
Total params: 3,542,617
Trainable params: 3,542,617
Non-trainable params: 0
_________________________________________

In [27]:
results = add_lstm_model.fit(X_train, y_train, epochs=10, verbose = True, validation_data=(X_val,y_val), batch_size=128)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [28]:
y_pred = add_lstm_model.predict(X_test)



In [29]:
# Flatten the predictions and the true labels to 1D arrays
predictions = y_pred.flatten()
y_true = y_test.flatten()

th = 0.1
predictions[predictions >= th] = 1 
predictions[predictions  < th] = 0

# Compute the F1 score
f1 = f1_score(y_true, predictions, average = 'macro')

print("Macro-F1 score:", round(f1,3))

Macro-F1 score: 0.955


### 3.4 Additional dense layer (MACRO f1 0.85)

Using two dense layers, one with a non-linear activation function and one with a softmax activation function, is a common pattern in neural network architectures for classification tasks.

The purpose of the non-linear dense layer is to introduce non-linearity into the model, which can allow the model to learn more complex patterns in the data. Common choices for the activation function in this layer include ReLU (Rectified Linear Unit), sigmoid, and tanh.

The purpose of the softmax dense layer is to produce a probability distribution over the possible classes. The softmax activation function transforms the output of the preceding layer into a probability distribution, where the sum of the probabilities is equal to 1. This is useful for classification tasks, where you want to predict the probability that an input belongs to each of the possible classes. Using two dense layers in this way can allow the model to learn more complex patterns in the data and make more accurate predictions.

In [30]:
# Define the model
add_fc_model = tf.keras.Sequential(name='Additional_FC')

# Add the Embedding layer
add_fc_model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, \
                    weights = [embedding_matrix], input_length = max_len, trainable=True))

# Add the Bidirectional LSTM layer
add_fc_model.add(Bidirectional(LSTM(units=128, return_sequences=True)))

# Add another Dense layer
add_fc_model.add(TimeDistributed(Dense(units=max_len, activation='relu')))

# Add the Dense/Fully-Connected layer
add_fc_model.add(TimeDistributed(Dense(units=len(tags_train)+1, activation='softmax')))

# Compile the model
add_fc_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Summary
add_fc_model.summary()

Model: "Additional_FC"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 171, 300)          2700600   
                                                                 
 bidirectional_3 (Bidirectio  (None, 171, 256)         439296    
 nal)                                                            
                                                                 
 time_distributed_3 (TimeDis  (None, 171, 171)         43947     
 tributed)                                                       
                                                                 
 time_distributed_4 (TimeDis  (None, 171, 33)          5676      
 tributed)                                                       
                                                                 
Total params: 3,189,519
Trainable params: 3,189,519
Non-trainable params: 0
___________________________________________

In [31]:
results = add_fc_model.fit(X_train, y_train, epochs=10, verbose = True, validation_data=(X_val,y_val), batch_size=128)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [32]:
y_pred = add_fc_model.predict(X_test)



In [33]:
# Flatten the predictions and the true labels to 1D arrays
predictions = y_pred.flatten()
y_true = y_test.flatten()

th = 0.1
predictions[predictions >= th] = 1 
predictions[predictions  < th] = 0

# Compute the F1 score
f1 = f1_score(y_true, predictions, average = 'macro')

print("Macro-F1 score:", round(f1,3))

Macro-F1 score: 0.965
