In [2]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.24.0-py3-none-any.whl (5.5 MB)
[K     |████████████████████████████████| 5.5 MB 27.5 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 54.7 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.0-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 74.1 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.0 tokenizers-0.13.2 transformers-4.24.0


In [1]:
!pip install torchtext==0.10.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torchtext==0.10.0
  Downloading torchtext-0.10.0-cp37-cp37m-manylinux1_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 14.9 MB/s 
[?25hCollecting torch==1.9.0
  Downloading torch-1.9.0-cp37-cp37m-manylinux1_x86_64.whl (831.4 MB)
[K     |████████████████████████████████| 831.4 MB 2.7 kB/s 
Installing collected packages: torch, torchtext
  Attempting uninstall: torch
    Found existing installation: torch 1.12.1+cu113
    Uninstalling torch-1.12.1+cu113:
      Successfully uninstalled torch-1.12.1+cu113
  Attempting uninstall: torchtext
    Found existing installation: torchtext 0.13.1
    Uninstalling torchtext-0.13.1:
      Successfully uninstalled torchtext-0.13.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torchvision 

import all the required modules

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as f
import torch.optim as optim


from torchtext.legacy import data
from torchtext.legacy import datasets
from transformers import BertTokenizer, BertModel

import numpy as np
import time
import random
import functools

set the random seeds for reproducability

In [4]:
Seed = 1234

random.seed(Seed)
np.random.seed(Seed)
torch.manual_seed(Seed)
torch.backends.cudnn.deterministic = True

import the BERT tokenizer

In [5]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

get the special tokens

In [6]:
init_token = tokenizer.cls_token
pad_token = tokenizer.pad_token
unk_token = tokenizer.unk_token

get the indexes of the special tokens

In [7]:
init_token_idx = tokenizer.convert_tokens_to_ids(init_token)
pad_token_idx = tokenizer.convert_tokens_to_ids(pad_token)
unk_token_idx = tokenizer.convert_tokens_to_ids(unk_token)

get the maximum length of the pretrained model

In [8]:
max_input_length = tokenizer.max_model_input_sizes['bert-base-cased']

define a function to cut the sequence of tokens to the desired maximum length and then convert the tokens into indexes

In [18]:
def cut_and_convert_to_id(tokens,tokenizer,max_input_length):
  tokens = tokens[:max_input_length-1]
  tokens = tokenizer.convert_tokens_to_ids(tokens)
  return tokens

define a function to cuts the sequence to the maximum length

In [23]:
def cut_to_max_length(tokens,max_input_length):
  tokens = tokens[:max_input_length-1]
  return tokens

We make use of Python's functools that allow us to pass functions

In [24]:
text_preprocessor = functools.partial(cut_and_convert_to_id,tokenizer = tokenizer, max_input_length = max_input_length)

tag_preprocessor = functools.partial(cut_to_max_length, max_input_length = max_input_length)


define the fields

In [25]:
TEXT = data.Field(use_vocab= False,lower = True,preprocessing = text_preprocessor,init_token = init_token_idx
                  ,pad_token = pad_token_idx , unk_token = unk_token_idx)


UD_TAGS = data.Field(unk_token = None,init_token = '<pad>',preprocessing = tag_preprocessor)

define which of our fields defined above correspond to which fields in the dataset

In [47]:
fields = (("text",TEXT),("udtags",UD_TAGS))

load the data using our fields

In [48]:
train_data,valid_data,test_data = datasets.UDPOS.splits(fields)

In [49]:
print(vars(train_data.examples[0]))

{'text': [2632, 1011, 100, 1024, 2137, 2749, 2730, 100, 14093, 2632, 1011, 100, 1010, 1996, 14512, 2012, 1996, 8806, 1999, 1996, 2237, 1997, 100, 1010, 2379, 1996, 9042, 3675, 1012], 'udtags': ['PROPN', 'PUNCT', 'PROPN', 'PUNCT', 'ADJ', 'NOUN', 'VERB', 'PROPN', 'PROPN', 'PROPN', 'PUNCT', 'PROPN', 'PUNCT', 'DET', 'NOUN', 'ADP', 'DET', 'NOUN', 'ADP', 'DET', 'NOUN', 'ADP', 'PROPN', 'PUNCT', 'ADP', 'DET', 'ADJ', 'NOUN', 'PUNCT']}


build the tag vocabulary so they can be numericalized during training

In [50]:
UD_TAGS.build_vocab(train_data)

define the iterators

In [51]:
Batch_size = 32
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator,valid_iterator,test_iterator = data.BucketIterator.splits((train_data,valid_data,test_data),
                                                                         batch_size = Batch_size,
                                                                         device = device)

Building the Model

In [30]:
class BERTPOStagger(nn.Module):
  def __init__(self,bert,output_dim,dropout):

    super().__init__()

    self.bert = bert
    emb_dim = bert.config.to_dict()['hidden_size']
    self.fc = nn.Linear(emb_dim,output_dim)
    self.dropout = nn.Dropout(dropout)

  def forward(self,text):
    
    text = text.permute(1,0)
    embedded = self.dropout(self.bert(text)[0])
    embedded = embedded.permute(1,0,2)
    predictions = self.fc(self.dropout(embedded))

    return predictions


load the actual pretrained BERT uncased model 

In [31]:
bert = BertModel.from_pretrained('bert-base-uncased')

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


set the hyperparameter

In [32]:
OUTPUT_DIM = len(UD_TAGS.vocab)
DROPOUT = 0.25

model = BERTPOStagger(bert,OUTPUT_DIM,DROPOUT)

define our optimizer

In [33]:
learning_rate = 5e-5

optimizer = optim.Adam(model.parameters(),lr = learning_rate)

define a loss function

In [34]:
TAG_PAD_IDX = UD_TAGS.vocab.stoi[UD_TAGS.pad_token]

criterion = nn.CrossEntropyLoss(ignore_index = TAG_PAD_IDX)

place the model on to the GPU

In [35]:
model = model.to(device)
criterion = criterion.to(device)

define a function which calculates our accuracy of predicting tags

In [36]:
def categorical_accuracy(preds, y, tag_pad_idx):
    
    max_preds = preds.argmax(dim = 1, keepdim = True) 
    non_pad_elements = (y != tag_pad_idx).nonzero()
    correct = max_preds[non_pad_elements].squeeze(1).eq(y[non_pad_elements])
    return correct.sum() / torch.FloatTensor([y[non_pad_elements].shape[0]]).to(device)

define the train function

In [55]:
def train(model,iterator,optimizer,criterion,tag_pad_idx):
  epoch_loss = 0
  epoch_acc = 0
  model.train()
  for batch in iterator:
    text = batch.text
    udtag = batch.udtags
    optimizer.zero_grad()
    predictions = model(text)
    predictions = predictions.view(-1,predictions.shape[-1])
    udtag = udtag.view(-1)
    loss = criterion(predictions,udtag)
    acc = categorical_accuracy(predictions,udtag,tag_pad_idx)
    loss.backward()
    optimizer.step()

    epoch_loss += loss.item()
    epoch_acc += acc.item()

  return epoch_loss / len(iterator), epoch_acc / len(iterator)  



define the evaluation function

In [38]:
def evaluate(model, iterator, criterion, tag_pad_idx):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:

            text = batch.text
            udtags = batch.udtags
            
            predictions = model(text)
            
            predictions = predictions.view(-1, predictions.shape[-1])
            udtags = udtags.view(-1)
            
            loss = criterion(predictions, udtags)
            
            acc = categorical_accuracy(predictions, udtags, tag_pad_idx)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

define a function to calculate the epoch time

In [40]:
def epoch_time(start_time, end_time):
    elap_time = end_time - start_time
    elap_mins = int(elap_time / 60)
    elap_secs = int(elap_time - (elap_mins * 60))
    return elap_mins, elap_secs

train the model

In [56]:
N_EPOCHS = 10

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion, TAG_PAD_IDX)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion, TAG_PAD_IDX)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut2-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val Loss: {valid_loss:.3f} |  Val Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 2m 8s
	Train Loss: 0.399 | Train Acc: 88.54%
	 Val Loss: 0.312 |  Val Acc: 90.09%
Epoch: 02 | Epoch Time: 2m 9s
	Train Loss: 0.120 | Train Acc: 96.55%
	 Val Loss: 0.288 |  Val Acc: 91.30%
Epoch: 03 | Epoch Time: 2m 8s
	Train Loss: 0.077 | Train Acc: 97.77%
	 Val Loss: 0.270 |  Val Acc: 92.28%
Epoch: 04 | Epoch Time: 2m 10s
	Train Loss: 0.053 | Train Acc: 98.46%
	 Val Loss: 0.267 |  Val Acc: 92.61%
Epoch: 05 | Epoch Time: 2m 8s
	Train Loss: 0.040 | Train Acc: 98.82%
	 Val Loss: 0.271 |  Val Acc: 92.80%
Epoch: 06 | Epoch Time: 2m 9s
	Train Loss: 0.030 | Train Acc: 99.14%
	 Val Loss: 0.304 |  Val Acc: 92.32%
Epoch: 07 | Epoch Time: 2m 9s
	Train Loss: 0.026 | Train Acc: 99.28%
	 Val Loss: 0.315 |  Val Acc: 91.44%
Epoch: 08 | Epoch Time: 2m 10s
	Train Loss: 0.022 | Train Acc: 99.36%
	 Val Loss: 0.303 |  Val Acc: 92.53%
Epoch: 09 | Epoch Time: 2m 9s
	Train Loss: 0.019 | Train Acc: 99.45%
	 Val Loss: 0.304 |  Val Acc: 92.24%
Epoch: 10 | Epoch Time: 2m 8s
	Train Loss: 0

test the model

In [57]:
model.load_state_dict(torch.load('tut2-model.pt'))

test_loss, test_acc = evaluate(model, test_iterator, criterion, TAG_PAD_IDX)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.305 | Test Acc: 90.90%


use the model to tag actual sentences

In [58]:
def tag_sentence(model, device, sentence, tokenizer, text_field, tag_field):
    
    model.eval()
    
    if isinstance(sentence, str):
        tokens = tokenizer.tokenize(sentence)
    else:
        tokens = sentence
    
    numericalized_tokens = tokenizer.convert_tokens_to_ids(tokens)
    numericalized_tokens = [text_field.init_token] + numericalized_tokens
        
    unk_idx = text_field.unk_token
    
    unks = [t for t, n in zip(tokens, numericalized_tokens) if n == unk_idx]
    
    token_tensor = torch.LongTensor(numericalized_tokens)
    
    token_tensor = token_tensor.unsqueeze(-1).to(device)
         
    predictions = model(token_tensor)
    
    top_predictions = predictions.argmax(-1)
    
    predicted_tags = [tag_field.vocab.itos[t.item()] for t in top_predictions]
    
    predicted_tags = predicted_tags[1:]
        
    assert len(tokens) == len(predicted_tags)
    
    return tokens, predicted_tags, unks

run an example sentence through our model and receive the predicted tags

In [59]:
sentence = 'The Queen will deliver a speech about the conflict in North Korea at 1pm tomorrow.'

tokens, tags, unks = tag_sentence(model, 
                                  device, 
                                  sentence,
                                  tokenizer,
                                  TEXT, 
                                  UD_TAGS)


In [60]:
print("Pred Tag\tToken\n")

for token, tag in zip(tokens, tags):
    print(f"{tag}\t\t{token}")

Pred Tag	Token

DET		the
NOUN		queen
AUX		will
VERB		deliver
DET		a
NOUN		speech
ADP		about
DET		the
NOUN		conflict
ADP		in
PROPN		north
PROPN		korea
ADP		at
NUM		1
NOUN		##pm
NOUN		tomorrow
PUNCT		.
