# Assignment 1

**Due to**: 11/01/2022 (dd/mm/yyyy)

If you deliver it by 11/12/2021 your assignment will be graded by 11/01/2022.


**Credits**: Andrea Galassi, Federico Ruggeri, Paolo Torroni

**Summary**: Part-of Speech (POS) tagging as Sequence Labelling using Recurrent Neural Architectures

# Execution



In [1]:
import re
import pandas as pd
import numpy as np
import os
import urllib.request
import zipfile
import progressbar

import nltk
import sklearn
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences

import tensorflow as tf
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, GRU
from sklearn.metrics import classification_report

nltk.download('treebank')
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Package treebank is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
def pre_process(df,string):
  def text_pre_process(text):
      ret = re.sub("RT @(.)+?:\s|(&#[0-9]+;)|@([\w\-]+)|(#)\S+|(http)s?\S+|&gt;|^\s+|\b\s+|\n", "", text)
      ret = re.sub("\s\s+|[^a-zA-Z\d\s:]" , " ", ret).rstrip().lower()
      return ret
  return df[string].apply(text_pre_process)

def Encoding(df,Tags):
  label_encoder = sklearn.preprocessing.LabelEncoder()
  X_lab = label_encoder.fit_transform(df)
  OneHot_encoder = sklearn.preprocessing.OneHotEncoder()
  X = OneHot_encoder.fit_transform(X_lab.reshape(-1,1)).toarray()
  dfOneHot = pd.DataFrame(X, columns = [i for i in Tags])
  df.reset_index(inplace=True,drop=True)
  df = pd.concat([df,dfOneHot],axis=1)
  return df

def create_embedding_matrix(filepath, word_index, embedding_dim):
  vocab_size = len(word_index)+1
  embedding_matrix = np.zeros((vocab_size,embedding_dim))

  with open(filepath, encoding='utf-8') as f:
    for line in f:
      word, *vector = line.split()
      if word in word_index:
        idx = word_index[word]
        embedding_matrix[idx] = np.array(vector, dtype=np.float32)[:embedding_dim]
  return embedding_matrix

## 1. Corpus
### 1.1 Pre-processing

From the original tags list we removed all the symbols and english punctuation plus:
- FW, Foreign Word, because there are no examples in the test set;
- UH, Interjection, because there are no examples in the test set;
- LS, List Item Marker, because there are no examples in the test set (and because it marks symbols too);

In [3]:
# Get the files' list
fileids = nltk.corpus.treebank.fileids()

# Get the Penn Treebank corpus and tokenize the text
train_corpus = nltk.corpus.treebank.tagged_sents(fileids[:100])
val_corpus = nltk.corpus.treebank.tagged_sents(fileids[100:150])
test_corpus = nltk.corpus.treebank.tagged_sents(fileids[150:])

# Flatten the lists
train_corpus = [item for sublist in train_corpus for item in sublist if item[1] != '-NONE-']
val_corpus = [item for sublist in val_corpus for item in sublist if item[1] != '-NONE-']
test_corpus = [item for sublist in test_corpus for item in sublist if item[1] != '-NONE-']

In [4]:
train_df = pd.DataFrame(train_corpus, columns = ['word', 'tag'])

train_df['word'] = pre_process(train_df,'word')

remove = [':', '#', '"', '$', '-LRB-', '-RRB-', ',', '.', "''", '``', 'SYM']
for r in remove:
  train_df['tag'].replace(r, np.nan, inplace=True)
train_df.dropna(inplace=True)


tags_train = sorted(list(set([x for x in train_df.tag])))

print(train_df.shape)
train_df.describe()

(41274, 2)


Unnamed: 0,word,tag
count,41274,41274
unique,7359,35
top,the,NN
freq,2329,6270


In [5]:
val_df = pd.DataFrame(val_corpus, columns = ['word', 'tag'])

val_df['word'] = pre_process(val_df,'word')

remove = [':', '#', '"', '$', '-LRB-', '-RRB-', ',', '.', "''", '``', 'SYM']
for r in remove:
  val_df['tag'].replace(r, np.nan, inplace=True)
val_df.dropna(inplace=True)


tags_val = sorted(list(set([x for x in val_df.tag])))

print(val_df.shape)
val_df.describe()

(27418, 2)


Unnamed: 0,word,tag
count,27418,27418
unique,5385,35
top,the,NN
freq,1670,4513


In [6]:
test_df = pd.DataFrame(test_corpus, columns = ['word', 'tag'])

test_df['word'] = pre_process(test_df,'word')

remove = [':', '#', '"', '$', '-LRB-', '-RRB-', ',', '.', "''", '``', 'SYM']
for r in remove:
  test_df['tag'].replace(r, np.nan, inplace=True)
test_df.dropna(inplace=True)


tags_test = sorted(list(set([x for x in test_df.tag])))

print(test_df.shape)
test_df.describe()

(13676, 2)


Unnamed: 0,word,tag
count,13676,13676
unique,3383,32
top,the,NN
freq,765,2383


In [7]:
print('Train:',len(tags_train))
print('Val:',len(tags_val))
print('Test:',len(tags_test))

if len(tags_test) != len(tags_val) or len(tags_test) != len(tags_train):
  print('Mismatching number of classes.')
else:
  print('\nTags:')
  for tag in tags_train:
    print(f'-{tag}')

Train: 35
Val: 35
Test: 32
Mismatching number of classes.


In [8]:
missing_classes_train = [x for x in tags_train if x not in tags_test]
missing_classes_val   = [x for x in tags_val if x not in tags_test]

missing_classes = list(set(missing_classes_train + missing_classes_val))

for cl in missing_classes:
  train_df = test_df[test_df.tag != cl]
  val_df = test_df[test_df.tag != cl]

tags_train = sorted(list(set([x for x in train_df.tag])))
tags_val = sorted(list(set([x for x in val_df.tag])))
tags_test = sorted(list(set([x for x in test_df.tag])))

print('Train:',len(tags_train))
print('Val:',len(tags_val))
print('Test:',len(tags_test))

if len(tags_test) != len(tags_val) or len(tags_test) != len(tags_train):
  print('Mismatching number of classes.')
else:
  print('\nTags:')
  for tag in tags_train:
    print(f'-{tag}')

Train: 32
Val: 32
Test: 32

Tags:
-CC
-CD
-DT
-EX
-IN
-JJ
-JJR
-JJS
-MD
-NN
-NNP
-NNPS
-NNS
-PDT
-POS
-PRP
-PRP$
-RB
-RBR
-RBS
-RP
-TO
-VB
-VBD
-VBG
-VBN
-VBP
-VBZ
-WDT
-WP
-WP$
-WRB


In [9]:
label_encoder = LabelEncoder()

train_df['tag'] = label_encoder.fit_transform(train_df['tag'])
test_df['tag']  = label_encoder.fit_transform(test_df['tag'])
val_df['tag']   = label_encoder.fit_transform(val_df['tag'])

print(train_df['tag'].unique())
print(val_df['tag'].unique())
print(test_df['tag'].unique())

[10 23 15  1 12  0  4  9 16  5  2 27 14 21  8 22 24 17 25 29 26 31 28  6
  7 20 11 18  3 19 30 13]
[10 23 15  1 12  0  4  9 16  5  2 27 14 21  8 22 24 17 25 29 26 31 28  6
  7 20 11 18  3 19 30 13]
[10 23 15  1 12  0  4  9 16  5  2 27 14 21  8 22 24 17 25 29 26 31 28  6
  7 20 11 18  3 19 30 13]


In [10]:
X_train = train_df.word
y_train = train_df.tag
y_train = Encoding(y_train, tags_train)

print(y_train.shape)
print(y_train.head())

X_val = val_df.word
y_val = val_df.tag
y_val = Encoding(y_val, tags_val)
print(y_val.shape)
print(y_val.head())

X_test = test_df.word
y_test = test_df.tag
y_test = Encoding(y_test, tags_test)
print(y_test.shape)
print(y_test.head())

y_train.drop('tag',inplace=True,axis=1)
y_test.drop('tag',inplace=True,axis=1)
y_val.drop('tag',inplace=True,axis=1)

(13676, 33)
   tag   CC   CD   DT   EX   IN   JJ  JJR  JJS   MD  ...   VB  VBD  VBG  VBN  \
0   10  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
1   10  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
2   10  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
3   10  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
4   10  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   

   VBP  VBZ  WDT   WP  WP$  WRB  
0  0.0  0.0  0.0  0.0  0.0  0.0  
1  0.0  0.0  0.0  0.0  0.0  0.0  
2  0.0  0.0  0.0  0.0  0.0  0.0  
3  0.0  0.0  0.0  0.0  0.0  0.0  
4  0.0  0.0  0.0  0.0  0.0  0.0  

[5 rows x 33 columns]
(13676, 33)
   tag   CC   CD   DT   EX   IN   JJ  JJR  JJS   MD  ...   VB  VBD  VBG  VBN  \
0   10  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
1   10  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
2   10  0.0  0.0  0.0  0.0  

In [11]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

num_words = 9000
tokenizer.word_index = {e:i for e,i in tokenizer.word_index.items() if i <= num_words}
tokenizer.word_index[tokenizer.oov_token] = num_words + 1

X_train = tokenizer.texts_to_sequences(X_train)
X_val = tokenizer.texts_to_sequences(X_val)
X_test = tokenizer.texts_to_sequences(X_test)
vocab_size = len(tokenizer.word_index) + 1

In [12]:
max_len = 50
X_train = pad_sequences(X_train,padding='post',maxlen=max_len)
X_val = pad_sequences(X_val,padding='post',maxlen=max_len)
X_test = pad_sequences(X_test,padding='post',maxlen=max_len)

## 2. GloVe 

In [13]:
pbar = None
def show_progress(block_num, block_size, total_size):
    global pbar
    if pbar is None:
        pbar = progressbar.ProgressBar(maxval=total_size)
        pbar.start()

    downloaded = block_num * block_size
    if downloaded < total_size:
        pbar.update(downloaded)
    else:
        pbar.finish()
        pbar = None

# Download the GloVe embeddings file
url = 'http://nlp.stanford.edu/data/glove.6B.zip'
urllib.request.urlretrieve(url, 'glove.6B.zip', show_progress)

# Extract the zip file
zip_ref = zipfile.ZipFile('glove.6B.zip', 'r')
zip_ref.extractall()
zip_ref.close()

100% (862182613 of 862182613) |##########| Elapsed Time: 0:02:39 Time:  0:02:39


In [14]:
# Load the GloVe embeddings into a dictionary
embedding_dict = {}
with open('glove.6B.50d.txt', 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embedding_dict[word] = coefs

# Print the number of words in the embeddings dictionary
print(f'Found {len(embedding_dict)} word vectors.')

Found 400000 word vectors.


In [15]:
def find_closest_embeddings(embedding):
    return sorted(embedding_dict.keys(), key=lambda word: np.linalg.norm(embedding_dict[word]- embedding))[:5]

find_closest_embeddings(embedding_dict['iphone'])

['iphone', 'ipad', 'smartphone', 'ipod', 'android']

In [16]:
input_dim = X_train.shape[1]
embedding_dim = 50
embedding_matrix = create_embedding_matrix(f'glove.6B.{embedding_dim}d.txt', tokenizer.word_index, embedding_dim)

## 3. Model
### 3.1 Baseline

In [17]:
# Define the model
model = tf.keras.Sequential(name='Baseline')

# Add the Embedding layer
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, \
                    weights = [embedding_matrix], input_length = max_len, trainable=True))

# Add the Bidirectional LSTM layer
model.add(Bidirectional(LSTM(units=128)))

# Add the Dense/Fully-Connected layer
model.add(Dense(units=len(tags_train), activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Summary
model.summary()

Model: "Baseline"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 50, 50)            158450    
                                                                 
 bidirectional (Bidirectiona  (None, 256)              183296    
 l)                                                              
                                                                 
 dense (Dense)               (None, 32)                8224      
                                                                 
Total params: 349,970
Trainable params: 349,970
Non-trainable params: 0
_________________________________________________________________


In [18]:
results = model.fit(X_train, y_train, epochs=10, verbose = True, validation_data=(X_val,y_val), batch_size=32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [19]:
y_pred = model.predict(X_test)



In [20]:
print(y_pred.shape)
print(y_test.shape)

(13676, 32)
(13676, 32)


In [21]:
th = 0.1
y_pred[y_pred >= th] = 1 
y_pred[y_pred  < th] = 0

print(classification_report(y_test, y_pred, target_names = tags_train, zero_division=True))

              precision    recall  f1-score   support

          CC       1.00      0.96      0.98       366
          CD       1.00      1.00      1.00       858
          DT       0.99      0.99      0.99      1335
          EX       0.83      1.00      0.91         5
          IN       0.93      1.00      0.96      1630
          JJ       0.82      1.00      0.90       918
         JJR       0.79      1.00      0.88        59
         JJS       0.91      1.00      0.95        31
          MD       0.96      1.00      0.98       167
          NN       0.88      0.99      0.93      2383
         NNP       0.81      0.98      0.89      1504
        NNPS       0.84      0.84      0.84        44
         NNS       0.93      1.00      0.96       941
         PDT       0.12      0.50      0.20         4
         POS       0.94      0.95      0.94       152
         PRP       1.00      1.00      1.00       192
        PRP$       0.99      1.00      0.99        99
          RB       0.89    

### 3.2 GRU 

In [22]:
# Define the model
model = tf.keras.Sequential(name='GRU')

# Add the Embedding layer
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, \
                    weights = [embedding_matrix], input_length = max_len, trainable=True))

# Add the GRU layer
model.add(GRU(units=128))

# Add the Dense/Fully-Connected layer
model.add(Dense(units=len(tags_train), activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Summary
model.summary()

Model: "GRU"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 50, 50)            158450    
                                                                 
 gru (GRU)                   (None, 128)               69120     
                                                                 
 dense_1 (Dense)             (None, 32)                4128      
                                                                 
Total params: 231,698
Trainable params: 231,698
Non-trainable params: 0
_________________________________________________________________


In [23]:
results = model.fit(X_train, y_train, epochs=10, verbose = True, validation_data=(X_val,y_val), batch_size=32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [24]:
y_pred = model.predict(X_test)



In [25]:
print(y_pred.shape)
print(y_test.shape)

(13676, 32)
(13676, 32)


In [26]:
th = 0.1
y_pred[y_pred >= th] = 1 
y_pred[y_pred  < th] = 0

print(classification_report(y_test, y_pred, target_names = tags_train, zero_division=True))

              precision    recall  f1-score   support

          CC       0.78      0.99      0.87       366
          CD       0.99      1.00      1.00       858
          DT       0.99      0.99      0.99      1335
          EX       0.83      1.00      0.91         5
          IN       0.93      0.99      0.96      1630
          JJ       0.80      0.99      0.88       918
         JJR       0.70      0.97      0.81        59
         JJS       0.64      0.97      0.77        31
          MD       0.89      1.00      0.94       167
          NN       0.87      0.99      0.93      2383
         NNP       0.80      0.98      0.88      1504
        NNPS       0.76      0.84      0.80        44
         NNS       0.93      0.99      0.96       941
         PDT       1.00      0.00      0.00         4
         POS       0.89      0.95      0.92       152
         PRP       0.90      0.98      0.94       192
        PRP$       0.95      1.00      0.98        99
          RB       0.72    

### 3.3 Additional LSTM layer 

In [27]:
# Define the model
model = tf.keras.Sequential(name='Baseline')

# Add the Embedding layer
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, \
                    weights = [embedding_matrix], input_length = max_len, trainable=True))

# Add the Bidirectional LSTM layer
model.add(Bidirectional(LSTM(units=128, return_sequences=True)))

# Add another LSTM layer
model.add(LSTM(units=128))

# Add the Dense/Fully-Connected layer
model.add(Dense(units=len(tags_train), activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Summary
model.summary()

Model: "Baseline"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 50, 50)            158450    
                                                                 
 bidirectional_1 (Bidirectio  (None, 50, 256)          183296    
 nal)                                                            
                                                                 
 lstm_2 (LSTM)               (None, 128)               197120    
                                                                 
 dense_2 (Dense)             (None, 32)                4128      
                                                                 
Total params: 542,994
Trainable params: 542,994
Non-trainable params: 0
_________________________________________________________________


In [28]:
results = model.fit(X_train, y_train, epochs=10, verbose = True, validation_data=(X_val,y_val), batch_size=32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [29]:
y_pred = model.predict(X_test)



In [30]:
print(y_pred.shape)
print(y_test.shape)

(13676, 32)
(13676, 32)


In [31]:
th = 0.1
y_pred[y_pred >= th] = 1 
y_pred[y_pred  < th] = 0

print(classification_report(y_test, y_pred, target_names = tags_train, zero_division=True))

              precision    recall  f1-score   support

          CC       1.00      0.00      0.00       366
          CD       1.00      0.00      0.00       858
          DT       1.00      0.00      0.00      1335
          EX       1.00      0.00      0.00         5
          IN       0.12      1.00      0.21      1630
          JJ       0.14      0.02      0.04       918
         JJR       1.00      0.00      0.00        59
         JJS       1.00      0.00      0.00        31
          MD       1.00      0.00      0.00       167
          NN       0.17      1.00      0.30      2383
         NNP       0.11      1.00      0.20      1504
        NNPS       1.00      0.00      0.00        44
         NNS       0.29      0.04      0.07       941
         PDT       1.00      0.00      0.00         4
         POS       1.00      0.00      0.00       152
         PRP       1.00      0.00      0.00       192
        PRP$       1.00      0.00      0.00        99
          RB       1.00    


### 3.4 Additional dense layer

In [32]:
# Define the model
model = tf.keras.Sequential(name='Baseline')

# Add the Embedding layer
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, \
                    weights = [embedding_matrix], input_length = max_len, trainable=True))

# Add the Bidirectional LSTM layer
model.add(Bidirectional(LSTM(units=128)))

# Add another Dense layer
model.add(Dense(units=256, activation='softmax'))

# Add the Dense/Fully-Connected layer
model.add(Dense(units=len(tags_train), activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Summary
model.summary()

Model: "Baseline"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 50, 50)            158450    
                                                                 
 bidirectional_2 (Bidirectio  (None, 256)              183296    
 nal)                                                            
                                                                 
 dense_3 (Dense)             (None, 256)               65792     
                                                                 
 dense_4 (Dense)             (None, 32)                8224      
                                                                 
Total params: 415,762
Trainable params: 415,762
Non-trainable params: 0
_________________________________________________________________


In [33]:
results = model.fit(X_train, y_train, epochs=10, verbose = True, validation_data=(X_val,y_val), batch_size=32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [34]:
y_pred = model.predict(X_test)



In [35]:
print(y_pred.shape)
print(y_test.shape)

(13676, 32)
(13676, 32)


In [36]:
th = 0.1
y_pred[y_pred >= th] = 1 
y_pred[y_pred  < th] = 0

print(classification_report(y_test, y_pred, target_names = tags_train, zero_division=True))

              precision    recall  f1-score   support

          CC       0.19      0.96      0.32       366
          CD       0.14      0.99      0.24       858
          DT       0.58      0.99      0.73      1335
          EX       1.00      0.00      0.00         5
          IN       0.22      0.99      0.36      1630
          JJ       0.21      0.93      0.35       918
         JJR       1.00      0.00      0.00        59
         JJS       1.00      0.00      0.00        31
          MD       1.00      0.00      0.00       167
          NN       0.30      0.98      0.46      2383
         NNP       0.20      0.97      0.33      1504
        NNPS       1.00      0.00      0.00        44
         NNS       0.63      0.96      0.76       941
         PDT       1.00      0.00      0.00         4
         POS       1.00      0.00      0.00       152
         PRP       1.00      0.00      0.00       192
        PRP$       1.00      0.00      0.00        99
          RB       0.15    