<a href="https://colab.research.google.com/github/andreunifi/Bert-POS-Tagging-Thesis/blob/main/Copia_di_Copia_di_Pos_tagging_with_Bert_Fine_tuning_TESI_Updated.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

[BERT](https://arxiv.org/abs/1810.04805) is known to be good at Sequence tagging tasks like Named Entity Recognition. Let's see if it's true for POS-tagging.

In [None]:
__author__ = "kyubyong"
__address__ = "https://github.com/kyubyong/nlp_made_easy"
__email__ = "kbpark.linguist@gmail.com"

In [1]:
%load_ext tensorboard

SyntaxError: invalid syntax (<ipython-input-3-5b7381d8b912>, line 1)

In [2]:
import os
from tqdm import tqdm_notebook as tqdm
import numpy as np
import torch
import torch.nn as nn
from torch.utils import data
import torch.optim as optim
from transformers import BertTokenizer

In [None]:
torch.__version__

# Data preparation

Thanks to the great NLTK, we don't have to worry about datasets. Some of Penn Tree Banks are included in it. I believe they serves for the purpose.

In [3]:
import nltk
nltk.download('treebank')
tagged_sents = nltk.corpus.treebank.tagged_sents()
len(tagged_sents)

[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.


3914

In [None]:
tagged_sents[1]

[('Mr.', 'NNP'),
 ('Vinken', 'NNP'),
 ('is', 'VBZ'),
 ('chairman', 'NN'),
 ('of', 'IN'),
 ('Elsevier', 'NNP'),
 ('N.V.', 'NNP'),
 (',', ','),
 ('the', 'DT'),
 ('Dutch', 'NNP'),
 ('publishing', 'VBG'),
 ('group', 'NN'),
 ('.', '.')]

In [6]:
tags = list(set(word_pos[1] for sent in tagged_sents for word_pos in sent))

In [7]:
",".join(tags)

"CD,#,-RRB-,PRP$,SYM,PRP,PDT,,,.,VBZ,CC,JJS,-LRB-,``,MD,VB,UH,WP$,NNP,RP,NNS,-NONE-,EX,NN,VBN,WRB,'',DT,WDT,IN,RB,WP,NNPS,VBG,JJR,RBR,TO,VBP,POS,$,LS,:,JJ,FW,RBS,VBD"

In [8]:
# By convention, the 0'th slot is reserved for padding.
tags = ["<pad>"] + tags

In [9]:
tag2idx = {tag:idx for idx, tag in enumerate(tags)}
idx2tag = {idx:tag for idx, tag in enumerate(tags)}

In [10]:
# Let's split the data into train and test (or eval)
from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split(tagged_sents, test_size=.1)
len(train_data), len(test_data)

(3522, 392)

In [11]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

# Data loader


In [12]:
# Provare a cambiare i tokenizers e tracciare il comportamento come tempi ed accuratezza.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

# **PosDataset class extends Pythorch Dataset**

>
It looks like you're defining a custom dataset class, PosDataset, for part-of-speech tagging using BERT. Let's break down the key components of this class:

Initialization (__init__ method):

tagged_sents: The input parameter representing a list of tagged sentences. Each sentence is a list of tuples, where each tuple contains a word and its corresponding part-of-speech tag.
Data Processing in Initialization:

sents and tags_li: Lists to store tokenized sentences and their corresponding part-of-speech tags. The special tokens [CLS] and [SEP] are added to the beginning and end of each sentence.
Tokenization is performed using the BERT tokenizer, and the tokenized sentences (sents) and part-of-speech tags (tags_li) are stored in the class.
Length Method (__len__):

Returns the number of sentences in the dataset.
Get Item Method (__getitem__):

Retrieves an item from the dataset by index.
words and tags: Original words and part-of-speech tags for the current sentence.
x, is_heads, and y: Lists for tokenized words, indicator of whether a token is the first piece of a word, and corresponding part-of-speech tag indices, respectively.
Tokenization and conversion to indices are performed using the BERT tokenizer and the provided tag2idx mapping.
The method returns the original words, tokenized word IDs (x), indicator for the first piece of each word (is_heads), original part-of-speech tags, part-of-speech tag IDs (y), and the sequence length.



In [13]:
class PosDataset(data.Dataset):
    def __init__(self, tagged_sents, bertokenizer ):
        sents, tags_li = [], [] # list of lists
        self.Tokenizer = bertokenizer;
        for sent in tagged_sents:
            words = [word_pos[0] for word_pos in sent]
            tags = [word_pos[1] for word_pos in sent]
            sents.append(["[CLS]"] + words + ["[SEP]"])
            tags_li.append(["<pad>"] + tags + ["<pad>"])
        self.sents, self.tags_li = sents, tags_li

    def __len__(self):
        return len(self.sents)

    def __getitem__(self, idx):
        words, tags = self.sents[idx], self.tags_li[idx] # words, tags: string list

        # We give credits only to the first piece.
        x, y = [], [] # list of ids
        is_heads = [] # list. 1: the token is the first piece of a word
        for w, t in zip(words, tags):
            tokens = self.Tokenizer.tokenize(w) if w not in ("[CLS]", "[SEP]") else [w]
            xx = self.Tokenizer.convert_tokens_to_ids(tokens)

            is_head = [1] + [0]*(len(tokens) - 1)

            t = [t] + ["<pad>"] * (len(tokens) - 1)  # <PAD>: no decision
            yy = [tag2idx[each] for each in t]  # (T,)

            x.extend(xx)
            is_heads.extend(is_head)
            y.extend(yy)

        assert len(x)==len(y)==len(is_heads), "len(x)={}, len(y)={}, len(is_heads)={}".format(len(x), len(y), len(is_heads))

        # seqlen
        seqlen = len(y)

        # to string
        words = " ".join(words)
        tags = " ".join(tags)
        return words, x, is_heads, tags, y, seqlen


In [14]:
dataset = PosDataset(tagged_sents,tokenizer)
dataset[4]

('[CLS] The asbestos fiber , crocidolite , is unusually resilient once it enters the lungs , with even brief exposures to it causing symptoms that *T*-1 show up decades later , researchers said 0 *T*-2 . [SEP]',
 [101,
  1109,
  1112,
  12866,
  11990,
  12753,
  117,
  172,
  2180,
  16388,
  11014,
  1566,
  117,
  1110,
  14624,
  1231,
  5053,
  19526,
  1517,
  1122,
  7603,
  1103,
  8682,
  117,
  1114,
  1256,
  4094,
  7401,
  1116,
  1106,
  1122,
  3989,
  8006,
  1115,
  115,
  157,
  115,
  118,
  122,
  1437,
  1146,
  4397,
  1224,
  117,
  6962,
  1163,
  121,
  115,
  157,
  115,
  118,
  123,
  119,
  102],
 [1,
  1,
  1,
  0,
  0,
  1,
  1,
  1,
  0,
  0,
  0,
  0,
  1,
  1,
  1,
  1,
  0,
  0,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  0,
  0,
  0,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  0,
  0,
  0,
  1,
  1],
 '<pad> DT NN NN , NN , VBZ RB JJ IN PRP VBZ DT NNS , IN RB JJ NNS TO PRP VBG NNS WDT -NONE- VBP

# **Pad function**


> The pad function appears to be a data processing function that pads the input batch to the length of the longest sequence in the batch. Let's break down the key components of this function:

Input Parameters:

> batch: A batch of samples, where each sample is a tuple containing information about words, tokenized word IDs (x), indicator for the first piece of each word (is_heads), original part-of-speech tags, part-of-speech tag IDs (y), and sequence length.
Processing Steps:

>Extract relevant information from the batch using lambda functions (f). f(0), f(2), f(3), and f(-1) extract words, indicator for the first piece of each word, original part-of-speech tags, and sequence lengths, respectively.

Finds the maximum sequence length (maxlen) in the batch.

>Define a lambda function f that pads sequences to a specified length (seqlen). This function is used to pad both the tokenized word IDs (x) and part-of-speech tag IDs (y). Padding is done with zeros (0), which likely corresponds to the <pad> token.

Apply the padding function to the tokenized word IDs (x) and part-of-speech tag IDs (y) using the maximum sequence length (maxlen).

Convert the padded tokenized word IDs (x) and part-of-speech tag IDs (y) to PyTorch LongTensors using torch.LongTensor.

>Return the padded words, tokenized word IDs (x), indicator for the first piece of each word (is_heads), original part-of-speech tags, padded part-of-speech tag IDs (y), and sequence lengths.

>In summary, this function is used to pad a batch of sequences to the length of the longest sequence in the batch, making it suitable for input to a neural network where all sequences in a batch must have the same length.



In [15]:
def pad(batch):
    '''Pads to the longest sample'''
    f = lambda x: [sample[x] for sample in batch]
    words = f(0)
    is_heads = f(2)
    tags = f(3)
    seqlens = f(-1)
    maxlen = np.array(seqlens).max()

    f = lambda x, seqlen: [sample[x] + [0] * (seqlen - len(sample[x])) for sample in batch] # 0: <pad>
    x = f(1, maxlen)
    y = f(-2, maxlen)


    f = torch.LongTensor

    return words, f(x), is_heads, tags, f(y), seqlens

# Model

In [16]:
from transformers import BertModel
from sklearn.metrics import accuracy_score

In [17]:
%%time
class Net(nn.Module):
    def __init__(self, vocab_size=None, bertmodel=None):
        super().__init__()
        # Provare a cambiare i modelli e tracciare il comportamento come tempi ed accuratezza.
        self.bert = bertmodel;
        # Qui puoi divertirti a cambiare o embedding size se si puo'
        # o a cambiare la rete neurale di uscita, che adesso e' un
        # semplice singolo layer linear.
        self.fc = nn.Linear(self.bert.config.hidden_size, vocab_size)   #changed linear input sizes to match the bert hidden sizes tensor. Bert- Standard has 768 hidden layers but Bert Larhe has 1024
        #self.fc = [nn.Dense(768, 512), nn.Linear(512, vocab_size)]
        self.device = device
    def forward(self, x):
        '''
        x: (N, T). int64
        y: (N, T). int64
        '''
        x = x.to(self.device)
        if self.training:
            self.bert.train()
            encoded_data = self.bert(x).last_hidden_state
        else:
            self.bert.eval()
            with torch.no_grad():
              encoded_data = self.bert(x).last_hidden_state
        logits = self.fc(encoded_data)
        y_hat = logits.argmax(-1)
        return logits, y_hat

CPU times: user 28 µs, sys: 0 ns, total: 28 µs
Wall time: 31 µs


# Train an evaluate

In [18]:
from torch.utils.tensorboard import SummaryWriter
log_dir="./logs"
  #tensorboard evalutaion code
writer = SummaryWriter(log_dir=log_dir)

    #tensorboard evaluation code end

In [19]:
def eval(model, iterator):
    model.eval()

    Words, Is_heads, Tags, Y, Y_hat = [], [], [], [], []
    with torch.no_grad():
        for i, batch in enumerate(iterator):
            words, x, is_heads, tags, y, seqlens = batch

            _, y_hat = model(x)  # y_hat: (N, T)

            Words.extend(words)
            Is_heads.extend(is_heads)
            Tags.extend(tags)
            Y.extend(y.numpy().tolist())
            Y_hat.extend(y_hat.cpu().numpy().tolist())

    ## gets results and save
    with open("result", 'w') as fout:
        for words, is_heads, tags, y_hat in zip(Words, Is_heads, Tags, Y_hat):
            y_hat = [hat for head, hat in zip(is_heads, y_hat) if head == 1]
            preds = [idx2tag[hat] for hat in y_hat]
            assert len(preds)==len(words.split())==len(tags.split())
            for w, t, p in zip(words.split()[1:-1], tags.split()[1:-1], preds[1:-1]):
                fout.write("{} {} {}\n".format(w, t, p))
            fout.write("\n")

    ## calc metric
    y_true =  np.array([tag2idx[line.split()[1]] for line in open('result', 'r').read().splitlines() if len(line) > 0])
    y_pred =  np.array([tag2idx[line.split()[2]] for line in open('result', 'r').read().splitlines() if len(line) > 0])

    acc = (y_true==y_pred).astype(np.int32).sum() / len(y_true)

    return acc;


In [None]:
def eval(batch,model,y_pred):
  model.eval()

  words, x, is_heads, tags, y, seqlens = batch







## Load model and train, variation of Tokenizer and models


In [None]:
def train(model, iterator, optimizer, criterion):

    model.to(device)

    #iterator.to(device)

    #optimizer.to(device)

    criterion.to(device)

    model.train()
    total_correct = 0
    total_samples = 0                #attenzione:total_samples qui viene aggiornato ad ogni "batch", ricalcola la accuracy ogni volta. E' corretto?
    totals=enumerate(iterator)


    for i, batch in totals:
        words, x, is_heads, tags, y, seqlens = batch

        x, y = x.to(device), y.to(device)

        _y = y # for monitoring
        optimizer.zero_grad()
        logits, _ = model(x) # logits: (N, T, VOCAB), y: (N, T)


        logits = logits.view(-1, logits.shape[-1]) # (N*T, VOCAB)
        y = y.view(-1)  # (N*T,)


        loss = criterion(logits, y)


        loss.backward()



        optimizer.step()
        _, predicted_labels = torch.max(logits, 1)
        correct = (predicted_labels == y).sum().item()
        total_correct += correct
        total_samples += y.size(0)
        accuracy = total_correct / total_samples * 100


        if i%10==0: # monitoring
            print("step: {}, loss: {}, accuracy: {}".format(i, loss.item(),accuracy))
            writer.add_scalar("Training "  + model.module.bert.config.name_or_path + " Loss/Step", loss.item(), i)
            writer.add_scalar("Training " + model.module.bert.config.name_or_path + " Accuracy/Step %", accuracy, i)


In [28]:
def train_multi_epochs(model, train_dataset, test_dataset, optimizer, criterion, num_epochs):


   model.to(device)
   #iterator.to(device)
   #optimizer.to(device)
   criterion.to(device)


   model.train()
   train_iterator = data.DataLoader(dataset=train_dataset,
                                    batch_size=24,
                                    shuffle=True,
                                    num_workers=1,
                                    collate_fn=pad)
   test_iterator = data.DataLoader(dataset=test_dataset,
                                   batch_size=64,
                                   shuffle=False,
                                   num_workers=1,
                                   collate_fn=pad)
   totals=enumerate(train_iterator)
   num_steps = len(totals)
   for e in range(num_epochs):
     for i, batch in totals:
       words, x, is_heads, tags, y, seqlens = batch


       x, y = x.to(device), y.to(device)


       _y = y # for monitoring
       optimizer.zero_grad()
       logits, _ = model(x) # logits: (N, T, VOCAB), y: (N, T)
       logits = logits.view(-1, logits.shape[-1]) # (N*T, VOCAB)
       y = y.view(-1)  # (N*T,)
       loss = criterion(logits, y)
       loss.backward()
       optimizer.step()


       if i%10==0: # monitoring
         # make eval return acc, so it can be tracked.
         #eval(model, test_iterator)
          model.train()  # revert it back to train mode.
          global_step = e*num_steps + i
          writer.add_scalar("Training "  + model.module.bert.config.name_or_path + " Loss/Step", loss.item(), global_step)
          accuracy = eval(model, test_iterator)
          writer.add_scalar("Training " + model.module.bert.config.name_or_path + " Accuracy/Step %", accuracy, global_step)


In [None]:
from transformers import BertModel

In [None]:
#sostituito device variable con cuda, ho problemi per far si che tutti i tensori sia su lo stesso
#bert_model_names = ['bert-base-uncased', 'bert-base-cased', 'bert-large-uncased', 'bert-large-cased']
bert_model_names = ['bert-large-uncased', 'bert-large-cased']
bert_models = {}
for name in bert_model_names:
    Bert= BertModel.from_pretrained(name)
    bert_models[name] = Net(vocab_size=len(tag2idx), bertmodel=Bert)
    bert_models[name].to(device)
    bert_models[name] = nn.DataParallel(bert_models[name])

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

In [None]:
train_dataset = PosDataset(train_data,tokenizer)
eval_dataset = PosDataset(test_data,tokenizer)

train_iter = data.DataLoader(dataset=train_dataset,
                             batch_size=8,
                             shuffle=True,
                             num_workers=1,
                             collate_fn=pad,
                             pin_memory=True)
test_iter = data.DataLoader(dataset=eval_dataset,
                             batch_size=8,
                             shuffle=False,
                             num_workers=1,
                             collate_fn=pad,
                            pin_memory=True)
optimizers = {}
for name in bert_model_names:
  optimizers[name] = optim.Adam(bert_models[name].module.parameters(), lr = 0.0001)

criterion = nn.CrossEntropyLoss(ignore_index=0).to(device)


In [None]:
for words, x, is_heads, tags, y, seqlen in train_iter:
  print(model(x))

In [None]:
import datetime
import tensorflow as tf


In [None]:
for name in bert_model_names:
  torch.cuda.empty_cache()
  train(bert_models[name], train_iter, optimizers[name], criterion)
  eval(bert_models[name], test_iter)




In [None]:
%tensorboard --logdir=./logs



Check the result.

In [None]:
open('result', 'r').read().splitlines()[:100]