https://github.com/pranavphoenix/BiLSTM-POS-Tagging/blob/main/BiLSTM_POS_Tagging.ipynb

TODO:
- Guardare creazione dizionario, bisogna rispettare i punti dell'assignment;
- Non togliere punctuation e symbols ma evitare di utilizzarli nel calcolo delle metriche, magari utilizzando l'array di pesi 'sample_weight' che si trova nell'altro notebook;
- Provare se i risultati migliorano con preprocessing (e.g. lowerando le parole);
- Aggiustare il notebook perché fa cagare;

In [1]:
import matplotlib.pyplot as plt
import nltk
import numpy as np
import os
import pandas as pd
import seaborn as sns
from collections import defaultdict
from keras import backend as K
from keras.layers import Dense, LSTM, InputLayer, Bidirectional, TimeDistributed
from keras.layers import Embedding, Activation
from keras.models import Sequential
from keras.optimizers import Adam
from keras_preprocessing.sequence import pad_sequences


import urllib.request
import zipfile
import progressbar

In [2]:
#Downloading the dataset
nltk.download('treebank')

data = nltk.corpus.treebank.tagged_sents()

#Downloading Glove Word Embeddings
pbar = None
def show_progress(block_num, block_size, total_size):
    global pbar
    if pbar is None:
        pbar = progressbar.ProgressBar(maxval=total_size)
        pbar.start()

    downloaded = block_num * block_size
    if downloaded < total_size:
        pbar.update(downloaded)
    else:
        pbar.finish()
        pbar = None

# Download the GloVe embeddings file
url = 'http://nlp.stanford.edu/data/glove.6B.zip'
urllib.request.urlretrieve(url, 'glove.6B.zip', show_progress)

# Extract the zip file
zip_ref = zipfile.ZipFile('glove.6B.zip', 'r')
zip_ref.extractall()
zip_ref.close()

[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.
100% (862182613 of 862182613) |##########| Elapsed Time: 0:02:38 Time:  0:02:38


In [24]:
#Function to ignore the 0 padding while calculating accuracy
def ignore_class_accuracy(to_ignore=0):
    def ignore_accuracy(y_true, y_pred):
        y_true_class = K.argmax(y_true, axis=-1)
        y_pred_class = K.argmax(y_pred, axis=-1)
 
        ignore_mask = K.cast(K.not_equal(y_pred_class, to_ignore), 'int32')
        matches = K.cast(K.equal(y_true_class, y_pred_class), 'int32') * ignore_mask
        accuracy = K.sum(matches) / K.maximum(K.sum(ignore_mask), 1)
        return accuracy
    return ignore_accuracy

#Function to return one code encoding of tags
def one_hot_encoding(tag_sents, n_tags):
    tag_one_hot_sent = []
    for tag_sent in tag_sents:
        tags_one_hot = []
        for tag in tag_sent:
            tags_one_hot.append(np.zeros(n_tags))
            tags_one_hot[-1][tag] = 1.0
        tag_one_hot_sent.append(tags_one_hot)
    return np.array(tag_one_hot_sent)

#Function to convert output into tags
def logits_to_tags(tag_sentences, index):
    tag_sequences = []
    for tag_sentence in tag_sentences:
        tag_sequence = []
        for tag in tag_sentence:
            # if index[np.argmax(tag)] == "-PAD-":
            #     break
            # else:
                tag_sequence.append(index[np.argmax(tag)])
        tag_sequences.append(np.array(tag_sequence))
    return tag_sequences

In [3]:
# Get the files' list
fileids = nltk.corpus.treebank.fileids()

# Get the Penn Treebank corpus and tokenize the text
train_corpus = nltk.corpus.treebank.tagged_sents(fileids[:100])
val_corpus = nltk.corpus.treebank.tagged_sents(fileids[100:150])
test_corpus = nltk.corpus.treebank.tagged_sents(fileids[150:])

# Flatten the lists
ignore = [':', '#', '"', '$', '-LRB-', '-RRB-', ',', '.', "''", '``', 'SYM', '-NONE-']

# train_corpus = [tuple(list(item)+[str(idx)]) for idx,sublist in enumerate(train_corpus) for item in sublist if item[1] != '-NONE-']
# val_corpus = [tuple(list(item)+[str(idx)]) for idx,sublist in enumerate(val_corpus) for item in sublist if item[1] != '-NONE-']
# test_corpus = [tuple(list(item)+[str(idx)]) for idx,sublist in enumerate(test_corpus) for item in sublist if item[1] != '-NONE-']

train_corpus = [tuple(list(item)+[str(idx)]) for idx,sublist in enumerate(train_corpus) for item in sublist if item[1] not in ignore]
val_corpus = [tuple(list(item)+[str(idx)]) for idx,sublist in enumerate(val_corpus) for item in sublist if item[1] not in ignore]
test_corpus = [tuple(list(item)+[str(idx)]) for idx,sublist in enumerate(test_corpus) for item in sublist if item[1] not in ignore]

train_df = pd.DataFrame(train_corpus, columns = ['word', 'tag', 'sentence'])
val_df = pd.DataFrame(val_corpus, columns = ['word', 'tag', 'sentence'])
test_df = pd.DataFrame(test_corpus, columns = ['word', 'tag', 'sentence'])
print(train_df)

              word  tag sentence
0           Pierre  NNP        0
1           Vinken  NNP        0
2               61   CD        0
3            years  NNS        0
4              old   JJ        0
...            ...  ...      ...
41269  acquisition   NN     1962
41270    challenge   NN     1962
41271           he  PRP     1962
41272          has  VBZ     1962
41273        faced  VBN     1962

[41274 rows x 3 columns]


In [4]:
tags_train = sorted(list(set([x for x in train_df.tag])))
tags_val = sorted(list(set([x for x in val_df.tag])))
tags_test = sorted(list(set([x for x in test_df.tag])))

print('Train tags number:',len(tags_train))
print('Val tags number:',len(tags_val))
print('Test tags number:',len(tags_test))

if len(tags_test) != len(tags_val) or len(tags_test) != len(tags_train):
  print('\nMismatching numbers.')
  print('Removing extra classes:')

  missing_classes_train = [x for x in tags_train if x not in tags_test]
  missing_classes_val   = [x for x in tags_val if x not in tags_test]

  missing_classes = list(set(missing_classes_train + missing_classes_val))
  print(missing_classes)

  for cl in missing_classes:
    train_df = train_df[train_df.tag != cl]
    val_df = val_df[val_df.tag != cl]

  tags_train = sorted(list(set([x for x in train_df.tag])))
  tags_val = sorted(list(set([x for x in val_df.tag])))
  tags_test = sorted(list(set([x for x in test_df.tag])))

  print('\nNew Train tags number:',len(tags_train))
  print('New Val tags number:',len(tags_val))
  print('New Test tags number:',len(tags_test))

print('\nTags:')
for tag in tags_train:
  print(f'-{tag}')

Train tags number: 35
Val tags number: 35
Test tags number: 32

Mismatching numbers.
Removing extra classes:
['LS', 'FW', 'UH']

New Train tags number: 32
New Val tags number: 32
New Test tags number: 32

Tags:
-CC
-CD
-DT
-EX
-IN
-JJ
-JJR
-JJS
-MD
-NN
-NNP
-NNPS
-NNS
-PDT
-POS
-PRP
-PRP$
-RB
-RBR
-RBS
-RP
-TO
-VB
-VBD
-VBG
-VBN
-VBP
-VBZ
-WDT
-WP
-WP$
-WRB


###Vocabulary

GloVe Vocabulary (V1)

In [41]:
#Use the 300 dimensional GLove Word Embeddings
glove_dir = './'

embeddings_index =  {} #initialize dictionary
f = open(os.path.join(glove_dir, 'glove.6B.300d.txt'))
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))


Found 400000 word vectors.


V1 + Training set OOV (V2)

In [42]:
embedding_dim = 300

def update_vocab(df,embeddings_index,embedding_dim):
  oov_c = 0
  for word in df.word:
    if word not in embeddings_index:
      oov_c += 1
      random_embed = np.random.rand(embedding_dim)
      embeddings_index[word] = random_embed
  print("Added",oov_c,"OOV words + respective embeddings to the vocabulary.")
  return embeddings_index

embeddings_index = update_vocab(train_df,embeddings_index,embedding_dim)

Added 2338 OOV words + respective embeddings to the vocabulary.


V2 + Validation set OOV (V3)

In [43]:
embeddings_index = update_vocab(val_df,embeddings_index,embedding_dim)

Added 942 OOV words + respective embeddings to the vocabulary.


V3 + Test set OOV (V4)

In [44]:
embeddings_index = update_vocab(test_df,embeddings_index,embedding_dim)

Added 455 OOV words + respective embeddings to the vocabulary.


In [15]:
#Building the actual word vocabulary

from collections import OrderedDict

idx2word = OrderedDict()
word2idx = OrderedDict()
    
curr_idx = 0
for key in embeddings_index.keys():
  word2idx[key] = curr_idx
  idx2word[curr_idx] = key
  curr_idx += 1

#word_listing = list(idx2word.values())

print(f'[Debug] Index -> Word vocabulary size: {len(idx2word)}')
print(f'[Debug] Word -> Index vocabulary size: {len(word2idx)}')

[Debug] Index -> Word vocabulary size: 403735
[Debug] Word -> Index vocabulary size: 403735


In [16]:
#Tag vocabulary

tag2idx = OrderedDict()

curr_id = 0

for tag in tags_train:
  tag2idx[tag] = curr_id
  curr_id += 1

print(f'[Debug] Tag -> Index vocabulary size: {len(tag2idx)}')

[Debug] Tag -> Index vocabulary size: 32


In [9]:
acc = []
conf_matrix = []

tag_list = tags_train
# The integers for each tag are the same as above

MAX_LENGTH = max([len(seq) for seq in data]) # maximum words in a sentence

conf_mat_df = pd.DataFrame(columns=tag_list, index=tag_list)
conf_mat_df = conf_mat_df.fillna(0)

In [17]:
train_sentences = train_df.groupby('sentence').word.apply(list).reset_index()['word']
val_sentences = val_df.groupby('sentence').word.apply(list).reset_index()['word']
test_sentences = test_df.groupby('sentence').word.apply(list).reset_index()['word']

train_tags = train_df.groupby('sentence').tag.apply(list).reset_index()['tag']
val_tags = val_df.groupby('sentence').tag.apply(list).reset_index()['tag']
test_tags = test_df.groupby('sentence').tag.apply(list).reset_index()['tag']

true_pos_tag = defaultdict(int)
false_pos_tag = defaultdict(int)
false_neg_tag = defaultdict(int)
precision_tags = defaultdict(float)
accuracy_tags = defaultdict(float)
recall_tags = defaultdict(float)
f1score_tags = defaultdict(float)


In [19]:
#Tokenising words and  by their indexes in vocabulary
train_sentences_X, test_sentences_X, val_sentences_X, train_tags_y, test_tags_y, val_tags_y = [], [], [], [], [], []

for sentence in train_sentences:
    sent_int = []
    for word in sentence:
        sent_int.append(word2idx[word])
   
    train_sentences_X.append(sent_int)

for sentence in val_sentences:
    sent_int = []
    for word in sentence:
        sent_int.append(word2idx[word])

    val_sentences_X.append(sent_int)

for sentence in test_sentences:
    sent_int = []
    for word in sentence:
        sent_int.append(word2idx[word])

    test_sentences_X.append(sent_int)

for sent_tags in train_tags:
    train_tags_y.append([tag2idx[tag] for tag in sent_tags])

for sent_tags in val_tags:
    val_tags_y.append([tag2idx[tag] for tag in sent_tags])

for sent_tags in test_tags:
    test_tags_y.append([tag2idx[tag] for tag in sent_tags])

#Add padding to sentences
train_sentences_X = pad_sequences(train_sentences_X, maxlen=MAX_LENGTH, padding='post')
val_sentences_X = pad_sequences(val_sentences_X, maxlen=MAX_LENGTH, padding='post')
test_sentences_X = pad_sequences(test_sentences_X, maxlen=MAX_LENGTH, padding='post')

train_tags_y = pad_sequences(train_tags_y, maxlen=MAX_LENGTH, padding='post')
val_tags_y = pad_sequences(val_tags_y, maxlen=MAX_LENGTH, padding='post')
test_tags_y = pad_sequences(test_tags_y, maxlen=MAX_LENGTH, padding='post')



In [34]:
#Building the Embedding Layer 
embedding_dim = 300

embedding_matrix = np.zeros((len(word2idx), embedding_dim))
for word, i in word2idx.items():
    embedding_vector = embeddings_index.get(word)
    if i < len(word2idx):
        embedding_matrix[i] = embedding_vector

#Building the BiLSTM model
model = Sequential()
model.add(InputLayer(input_shape=(MAX_LENGTH, ))) 
model.add(Embedding(len(word2idx), 300, weights=[embedding_matrix],trainable=False))
model.add(Bidirectional(LSTM(256, return_sequences=True)))
model.add(TimeDistributed(Dense(len(tag2idx))))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy', optimizer=Adam(0.001),
              metrics=['accuracy', ignore_class_accuracy(0)])
model.summary()
one_hot_train_tags_y = one_hot_encoding(train_tags_y, len(tag2idx))

#Training the model
# model.fit(train_sentences_X, one_hot_encoding(train_tags_y, len(tag2index)),\
#           validation_data=(val_sentences_X, one_hot_encoding(val_tags_y, len(tag2index))),
#           batch_size=128, epochs= 9, validation_split=0.2)
model.fit(train_sentences_X, one_hot_encoding(train_tags_y, len(tag2idx)),\
          validation_data=(val_sentences_X, one_hot_encoding(val_tags_y, len(tag2idx))),
          batch_size=64, epochs= 20)

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 271, 300)          121120500 
                                                                 
 bidirectional_2 (Bidirectio  (None, 271, 512)         1140736   
 nal)                                                            
                                                                 
 time_distributed_2 (TimeDis  (None, 271, 32)          16416     
 tributed)                                                       
                                                                 
 activation_2 (Activation)   (None, 271, 32)           0         
                                                                 
Total params: 122,277,652
Trainable params: 1,157,152
Non-trainable params: 121,120,500
_________________________________________________________________
Epoch 1/20
Epoch 2/20
Epoch 3/20

<keras.callbacks.History at 0x7f31cdcb4e20>

In [35]:
scores = model.evaluate(test_sentences_X, one_hot_encoding(test_tags_y, len(tag2idx)))
acc.append(scores[2]*100)


predictions = model.predict(test_sentences_X)
pred_sequence = logits_to_tags(predictions, {i: t for t, i in tag2idx.items()})
#y_prob_class = model.predict_classes(test_sentences_X, verbose = 1)



In [36]:
for sen_num in range(len(test_tags)):
    for i,tag in enumerate(test_tags[sen_num]):
      
        conf_mat_df[tag][pred_sequence[sen_num][i]] +=1
        if test_tags[sen_num][i] == pred_sequence[sen_num][i]:
          true_pos_tag[tag] += 1
        else:
          false_neg_tag[tag] += 1
          false_pos_tag[pred_sequence[sen_num][i]] += 1

for tag in tag_list[1:]:
    if (true_pos_tag[tag] + false_pos_tag[tag]) != 0:
      precision_tags[tag] = true_pos_tag[tag] / (true_pos_tag[tag] + false_pos_tag[tag])
      recall_tags[tag] = true_pos_tag[tag] / (true_pos_tag[tag] + false_neg_tag[tag])
      f1score_tags[tag] = 2 * precision_tags[tag] * recall_tags[tag] / (precision_tags[tag] + recall_tags[tag])
      accuracy_tags[tag] = true_pos_tag[tag] / (true_pos_tag[tag] + false_neg_tag[tag] + false_pos_tag[tag])

In [37]:
sum = 0
for el in f1score_tags.items():
  sum += el[1]
print(sum/len(f1score_tags.items()))

0.5487107560173037
