<a href="https://colab.research.google.com/github/cdrann/random/blob/main/Pos_tagging_with_Bert_Fine_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

[BERT](https://arxiv.org/abs/1810.04805) is known to be good at sequence tagging tasks such as Named Entity Recognition. Let's see whether that also holds for POS tagging.

In [None]:
pip install pytorch-pretrained-bert

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pytorch-pretrained-bert
  Downloading pytorch_pretrained_bert-0.6.2-py3-none-any.whl (123 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m123.8/123.8 KB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Collecting boto3
  Downloading boto3-1.26.94-py3-none-any.whl (135 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m135.1/135.1 KB[0m [31m939.8 kB/s[0m eta [36m0:00:00[0m
Collecting s3transfer<0.7.0,>=0.6.0
  Downloading s3transfer-0.6.0-py3-none-any.whl (79 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.6/79.6 KB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting botocore<1.30.0,>=1.29.94
  Downloading botocore-1.29.94-py3-none-any.whl (10.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.5/10.5 MB[0m [31m17.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting jmespath<2.0.0,>=0.7.1
  Downloa

In [None]:
import os
from tqdm import tqdm_notebook as tqdm
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

from torch.utils import data
from pytorch_pretrained_bert import BertTokenizer

In [None]:
torch.__version__

'1.13.1+cu116'

# Data preparation

Thanks to the great NLTK, we don't have to worry about datasets: a sample of the Penn Treebank is included in it. I believe it serves the purpose.

In [None]:
import nltk

# Download the 'treebank' corpus package through NLTK's downloader.
nltk.download('treebank')

[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.


True

In [None]:
import nltk

# The Penn Treebank Corpus:
# each sentence is a list of (word, POS tag) pairs.
tagged_sents = nltk.corpus.treebank.tagged_sents()
# Number of tagged sentences available.
len(tagged_sents)

3914

In [None]:
tagged_sents[0]

[('Pierre', 'NNP'),
 ('Vinken', 'NNP'),
 (',', ','),
 ('61', 'CD'),
 ('years', 'NNS'),
 ('old', 'JJ'),
 (',', ','),
 ('will', 'MD'),
 ('join', 'VB'),
 ('the', 'DT'),
 ('board', 'NN'),
 ('as', 'IN'),
 ('a', 'DT'),
 ('nonexecutive', 'JJ'),
 ('director', 'NN'),
 ('Nov.', 'NNP'),
 ('29', 'CD'),
 ('.', '.')]

In [None]:
# Collect the distinct POS tags. Sorting makes the tag -> index mapping built
# from this list reproducible across kernel restarts (set iteration order of
# strings is not stable between Python processes).
tags = sorted({word_pos[1] for sent in tagged_sents for word_pos in sent})

In [None]:
",".join(tags)

"-NONE-,VB,NNS,PRP,PRP$,JJR,LS,JJ,#,VBZ,EX,$,VBN,RP,WRB,WP,FW,POS,VBG,DT,NNPS,NN,CC,MD,:,NNP,VBD,RBR,RBS,TO,JJS,WDT,WP$,VBP,RB,-LRB-,PDT,.,CD,,,'',IN,-RRB-,UH,``,SYM"

In [None]:
# By convention, the 0'th slot is reserved for padding.
# Dropping any "<pad>" already present keeps this cell idempotent: re-running it
# must not add a second "<pad>", which would shift the padding tag away from index 0.
tags = ["<pad>"] + [tag for tag in tags if tag != "<pad>"]

In [None]:
# Lookup tables between tag strings and their integer ids (position in `tags`).
idx2tag = dict(enumerate(tags))
tag2idx = {tag: idx for idx, tag in idx2tag.items()}

In [None]:
# Let's split the data into train and test (or eval).
# A fixed random_state makes the split itself reproducible across runs.
from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split(tagged_sents, test_size=.1, random_state=42)
len(train_data), len(test_data)

(3522, 392)

In [None]:
# Use the GPU when PyTorch reports one as available; otherwise run on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [None]:
device

'cpu'

# Data loader


In [None]:
# Tokenizer loaded for the 'bert-base-cased' checkpoint; do_lower_case=False so input is not lower-cased.
# This module-level object is what PosDataset.__getitem__ uses to split words into pieces.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)

100%|██████████| 213450/213450 [00:00<00:00, 3297292.22B/s]


In [None]:
class PosDataset(data.Dataset):
    """Tagged sentences exposed as token-id / tag-id sequences.

    Relies on the notebook-level `tokenizer` (word -> pieces -> ids) and
    `tag2idx` (tag string -> id) objects.
    """

    def __init__(self, tagged_sents):
        """Store every sentence wrapped in [CLS] ... [SEP].

        tagged_sents: iterable of sentences, each a sequence of (word, tag) pairs.
        The two marker tokens are given the "<pad>" tag.
        """
        sents, tags_li = [], [] # list of lists
        for sent in tagged_sents:
            words = [word_pos[0] for word_pos in sent]
            tags = [word_pos[1] for word_pos in sent]
            sents.append(["[CLS]"] + words + ["[SEP]"])
            tags_li.append(["<pad>"] + tags + ["<pad>"])
        self.sents, self.tags_li = sents, tags_li

    def __len__(self):
        """Number of stored sentences."""
        return len(self.sents)

    def __getitem__(self, idx):
        """Return (words, x, is_heads, tags, y, seqlen) for sentence `idx`.

        words / tags: the sentence's words / tags joined with single spaces.
        x: token ids, one per piece.
        is_heads: 1 where the piece is the first piece of a word, else 0.
        y: tag ids, one per piece; non-first pieces get the "<pad>" tag id.
        seqlen: number of pieces.
        """
        words, tags = self.sents[idx], self.tags_li[idx] # words, tags: string list

        # We give credits only to the first piece.
        x, y = [], [] # list of ids
        is_heads = [] # list. 1: the token is the first piece of a word
        for w, t in zip(words, tags):
            tokens = tokenizer.tokenize(w) if w not in ("[CLS]", "[SEP]") else [w]
            if not tokens:
                # A word the tokenizer reduces to nothing would still contribute a tag
                # and a head flag but no input id, breaking the alignment below.
                # Keep one placeholder piece so the word stays taggable.
                tokens = ["[UNK]"]
            xx = tokenizer.convert_tokens_to_ids(tokens)

            is_head = [1] + [0]*(len(tokens) - 1)

            t = [t] + ["<pad>"] * (len(tokens) - 1)  # <PAD>: no decision
            yy = [tag2idx[each] for each in t]  # (T,)

            x.extend(xx)
            is_heads.extend(is_head)
            y.extend(yy)

        assert len(x)==len(y)==len(is_heads), "len(x)={}, len(y)={}, len(is_heads)={}".format(len(x), len(y), len(is_heads))

        # seqlen
        seqlen = len(y)

        # to string
        words = " ".join(words)
        tags = " ".join(tags)
        return words, x, is_heads, tags, y, seqlen


In [None]:
def pad(batch):
    '''Collate samples into one batch, right-padding id sequences with 0 to the longest sample.

    Each sample is indexed as (words, x, is_heads, tags, y, seqlen). The x and y
    lists are padded and returned as LongTensors; words, is_heads, tags and
    seqlens are returned as plain per-sample lists.
    '''
    def column(pos):
        # Gather field `pos` from every sample, unchanged.
        return [sample[pos] for sample in batch]

    def padded(pos, target_len):
        # Gather field `pos`, extending each list with 0 (<pad>) up to target_len.
        rows = []
        for sample in batch:
            seq = sample[pos]
            rows.append(seq + [0] * (target_len - len(seq)))
        return rows

    seqlens = column(-1)
    maxlen = np.array(seqlens).max()

    x = torch.LongTensor(padded(1, maxlen))
    y = torch.LongTensor(padded(-2, maxlen))

    return column(0), x, column(2), column(3), y, seqlens

# Model

In [None]:
from pytorch_pretrained_bert import BertModel

In [None]:
class Net(nn.Module):
    """Pretrained BERT encoder followed by a linear layer that scores every tag per token."""

    def __init__(self, vocab_size=None):
        """
        vocab_size: number of output classes of the linear layer (the notebook
            passes len(tag2idx), i.e. the tag set including "<pad>").

        Raises ValueError when vocab_size is not given, before the pretrained
        weights are loaded.
        """
        super().__init__()
        if vocab_size is None:
            raise ValueError("vocab_size (number of output tags) must be provided")
        self.bert = BertModel.from_pretrained('bert-base-cased')

        # 768: width of the encoder output that is fed to the classifier.
        self.fc = nn.Linear(768, vocab_size)
        self.device = device

    def _encode(self, x):
        '''Run BERT on token ids x and return the last of the encoded layers it yields.'''
        encoded_layers, _ = self.bert(x)
        return encoded_layers[-1]

    def forward(self, x, y):
        '''
        x: (N, T). int64
        y: (N, T). int64

        Returns (logits, y, y_hat): per-token tag scores, y moved to the
        model's device, and the argmax tag id per token.
        '''
        # Use the device recorded at construction time rather than a module global.
        x = x.to(self.device)
        y = y.to(self.device)

        # Keep the encoder's mode in step with this module's mode.
        self.bert.train(self.training)
        if self.training:
            enc = self._encode(x)
        else:
            # Outside training no gradients are tracked through the encoder.
            with torch.no_grad():
                enc = self._encode(x)

        logits = self.fc(enc)
        y_hat = logits.argmax(-1)
        return logits, y, y_hat

# Train and evaluate

In [None]:
def train(model, iterator, optimizer, criterion):
    """Run one pass over `iterator`, taking an optimizer step after every batch.

    model: callable as model(x, y) -> (logits, y, y_hat).
    iterator: yields (words, x, is_heads, tags, y, seqlens) batches.
    optimizer / criterion: used to zero gradients, compute the loss on the
        flattened logits and targets, and apply the update.
    The loss is printed every 10th step.
    """
    model.train()
    for step, (_, x, _, _, y, _) in enumerate(iterator):
        optimizer.zero_grad()
        logits, targets, _ = model(x, y)  # logits: (N, T, VOCAB), targets: (N, T)

        num_classes = logits.shape[-1]
        loss = criterion(
            logits.view(-1, num_classes),  # (N*T, VOCAB)
            targets.view(-1),              # (N*T,)
        )
        loss.backward()
        optimizer.step()

        if step % 10 == 0:  # monitoring
            print("step: {}, loss: {}".format(step, loss.item()))

In [None]:
def eval(model, iterator):
    """Tag every batch from `iterator`, save the predictions and print the accuracy.

    model: callable as model(x, y) -> (_, _, y_hat) with y_hat of shape (N, T).
    iterator: yields (words, x, is_heads, tags, y, seqlens) batches.

    Writes one "word gold_tag predicted_tag" line per word to the file "result"
    in the working directory, with a blank line after each sentence, then
    prints the share of words whose predicted tag equals the gold tag.
    Uses the notebook-level `idx2tag` mapping. Note that this function shadows
    the builtin `eval` wherever it is defined.
    """
    model.eval()

    Words, Is_heads, Tags, Y_hat = [], [], [], []
    with torch.no_grad():
        for batch in iterator:
            words, x, is_heads, tags, y, seqlens = batch

            _, _, y_hat = model(x, y)  # y_hat: (N, T)

            Words.extend(words)
            Is_heads.extend(is_heads)
            Tags.extend(tags)
            Y_hat.extend(y_hat.cpu().tolist())

    ## gets results and save; accuracy is tallied while writing so the file
    ## does not have to be re-opened and re-parsed afterwards
    n_correct, n_total = 0, 0
    with open("result", 'w') as fout:
        for words, is_heads, tags, y_hat in zip(Words, Is_heads, Tags, Y_hat):
            # Keep only the prediction made on the first piece of each word.
            y_hat = [hat for head, hat in zip(is_heads, y_hat) if head == 1]
            preds = [idx2tag[hat] for hat in y_hat]
            assert len(preds)==len(words.split())==len(tags.split())
            # [1:-1] drops the [CLS] / [SEP] positions.
            for w, t, p in zip(words.split()[1:-1], tags.split()[1:-1], preds[1:-1]):
                fout.write("{} {} {}\n".format(w, t, p))
                n_total += 1
                if t == p:
                    n_correct += 1
            fout.write("\n")

    ## calc metric (nan when there was nothing to score)
    acc = n_correct / n_total if n_total else float("nan")

    print("acc=%.2f"%acc)


## Load model and train

In [None]:
# Size the output layer to the tag set (len(tag2idx) includes "<pad>"),
# move the model to the selected device, then wrap it in nn.DataParallel.
model = Net(vocab_size=len(tag2idx))
model.to(device)
model = nn.DataParallel(model)

100%|██████████| 404400730/404400730 [00:10<00:00, 37145758.46B/s]


In [None]:
# Wrap both splits in the dataset class defined above.
train_dataset = PosDataset(train_data)
eval_dataset = PosDataset(test_data)

# Settings shared by both loaders; only shuffling differs between them.
loader_kwargs = dict(batch_size=8, num_workers=1, collate_fn=pad)
train_iter = data.DataLoader(dataset=train_dataset, shuffle=True, **loader_kwargs)
test_iter = data.DataLoader(dataset=eval_dataset, shuffle=False, **loader_kwargs)

optimizer = optim.Adam(model.parameters(), lr=1e-4)

# Index 0 is the "<pad>" tag, so positions labelled with it are left out of the loss.
criterion = nn.CrossEntropyLoss(ignore_index=0)

In [None]:
# One pass over the training loader, then score the held-out loader
# (eval also writes its per-word predictions to the "result" file).
train(model, train_iter, optimizer, criterion)
eval(model, test_iter)


step: 0, loss: 3.8685035705566406
step: 10, loss: 1.7828030586242676
step: 20, loss: 0.7950708866119385
step: 30, loss: 0.4256606996059418
step: 40, loss: 0.23999132215976715
step: 50, loss: 0.19955430924892426
step: 60, loss: 0.3931425213813782
step: 70, loss: 0.08938563615083694
step: 80, loss: 0.18984177708625793
step: 90, loss: 0.13247619569301605
step: 100, loss: 0.19631338119506836
step: 110, loss: 0.09652449190616608
step: 120, loss: 0.1010337769985199
step: 130, loss: 0.08772258460521698
step: 140, loss: 0.1630076766014099
step: 150, loss: 0.14914600551128387
step: 160, loss: 0.044792525470256805
step: 170, loss: 0.06629658490419388
step: 180, loss: 0.09267449378967285
step: 190, loss: 0.12260664999485016
step: 200, loss: 0.14829736948013306
step: 210, loss: 0.11935346573591232
step: 220, loss: 0.1254226416349411
step: 230, loss: 0.12320947647094727
step: 240, loss: 0.10381089150905609
step: 250, loss: 0.06362539529800415
step: 260, loss: 0.1427251547574997
step: 270, loss: 0.1

In [None]:
eval(model, test_iter)


acc=0.97


Check the result.

In [None]:
# Read the predictions file through a context manager so the handle is closed,
# then show the first 100 lines ("word gold_tag predicted_tag"; blank line between sentences).
with open('result', 'r') as fin:
    result_lines = fin.read().splitlines()
result_lines[:100]

['Meanwhile RB RB',
 ', , ,',
 'the DT DT',
 'National NNP NNP',
 'Association NNP NNP',
 'of IN IN',
 'Purchasing NNP NNP',
 'Management NNP NNP',
 'said VBD VBD',
 '0 -NONE- -NONE-',
 'its PRP$ PRP$',
 'latest JJS JJS',
 'survey NN NN',
 'indicated VBD VBD',
 'that IN IN',
 'the DT DT',
 'manufacturing NN NN',
 'economy NN NN',
 'contracted VBD VBN',
 'in IN IN',
 'October NNP NNP',
 'for IN IN',
 'the DT DT',
 'sixth JJ JJ',
 'consecutive JJ JJ',
 'month NN NN',
 '. . .',
 '',
 'Lancaster NNP NNP',
 'Colony NNP NNP',
 'Corp. NNP NNP',
 'said VBD VBD',
 '0 -NONE- -NONE-',
 'it PRP PRP',
 'acquired VBD VBD',
 'Reames NNP NNP',
 'Foods NNP NNPS',
 'Inc. NNP NNP',
 'in IN IN',
 'a DT DT',
 'cash NN NN',
 'transaction NN NN',
 '. . .',
 '',
 'Weatherly NNP NNP',
 'Securities NNP NNPS',
 'Corp. NNP NNP',
 ', , ,',
 'New NNP NNP',
 'York NNP NNP',
 ', , ,',
 'and CC CC',
 'three CD CD',
 'of IN IN',
 'its PRP$ PRP$',
 'principals NNS NNS',
 '-- : :',
 'Dell NNP NNP',
 'Eugene NNP NNP',
 'K