In [1]:
from transformers import BertTokenizer, BertModel
from tokenizers import trainers
import torch
import pandas as pd
from torch.utils.data import DataLoader
import torch.optim as optim
import numpy as np
from unidecode import unidecode
import collections
import torch.nn as nn
from torch.utils.data import Dataset
import torch.nn.functional as F
import functools
import pandas as pd
from tqdm import tqdm
import torchtext
import conllu
import re
import matplotlib.pyplot as plt

In [2]:
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")
bert_model = BertModel.from_pretrained("bert-base-multilingual-cased")

bert_default_vocab = tokenizer.get_vocab().keys()


Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
english_train = open("UD_English-ParTUT/en_partut-ud-train.conllu", "r", encoding="utf-8") 
english_test = open("UD_English-ParTUT/en_partut-ud-test.conllu", "r", encoding="utf-8") 
english_dev = open("UD_English-ParTUT/en_partut-ud-dev.conllu", "r", encoding="utf-8") 

eng_train_data = conllu.parse(english_train.read())  # ['id', 'form', 'lemma', 'upostag', 'xpostag', 'feats', 'head', 'deprel', 'deps', 'misc']
eng_train_sents = [[unidecode(token['form']) for token in sentence] for sentence in eng_train_data]
eng_u_train_tags = [[token['upostag'] for token in sentence] for sentence in eng_train_data]

In [52]:
def align_tokenizations(sentences, taggings, tokenizer):  
    bert_tokenized_sentences = []
    aligned_taggings = []
    for sentence, tagging in zip(sentences, taggings):
    # first generate BERT-tokenization        
        bert_tokenized_sentence = tokenizer.tokenize(' '.join(sentence))

        aligned_tagging = []
        current_word = ''
        index = 0 # index of current word in sentence and tagging
        for token in bert_tokenized_sentence:
            current_word += re.sub(r'^##', '', token) # recompose word with subtoken
            

      # note that some word factors correspond to unknown words in BERT
            assert token == '[UNK]' or sentence[index].startswith(current_word)  

            if token == '[UNK]' or sentence[index] == current_word: # if we completed a word
                current_word = ''
                aligned_tagging.append(tagging[index])
                index += 1
            else: # otherwise insert padding
                aligned_tagging.append('[PAD]')                
            

        assert len(bert_tokenized_sentence) == len(aligned_tagging)

        bert_tokenized_sentences.append(bert_tokenized_sentence)
        aligned_taggings.append(aligned_tagging)

    return bert_tokenized_sentences, aligned_taggings

In [53]:
aligned_trainedsents, aligned_trainedtags = align_tokenizations(eng_train_sents, eng_u_train_tags, tokenizer)
len_tags = len(set(tag for tags in aligned_trainedtags for tag in tags))

In [56]:
device = torch.device('cpu')
label_vocab = collections.defaultdict(lambda: len(label_vocab))
label_vocab['<pad>'] = 0

def convert_to_ids(sentences, taggings):
    sentences_ids = []
    taggings_ids = []
    for sentence, tagging in zip(sentences, taggings):
        sentence_tensor = torch.tensor(tokenizer.convert_tokens_to_ids(['[CLS]'] + sentence + ['SEP'])).long()
        tagging_tensor = torch.tensor([0] + [label_vocab[tag] for tag in tagging] + [0]).long()

        sentences_ids.append(sentence_tensor.to(device))
        taggings_ids.append(tagging_tensor.to(device))
    return sentences_ids, taggings_ids

def collate_fn(items):
    max_len = max(len(item[0]) for item in items)

    sentences = torch.zeros((len(items), max_len), device=items[0][0].device).long().to(device)
    taggings = torch.zeros((len(items), max_len)).long().to(device)

    for i, (sentence, tagging) in enumerate(items):
        sentences[i][:len(sentence)] = sentence
        taggings[i][:len(tagging)] = tagging

    return sentences, taggings

In [57]:
class PosTaggingDataset(Dataset):
    def __init__(self, sentences, taggings):
        assert len(sentences) == len(taggings)
        self.sentences = sentences
        self.taggings = taggings

    def __getitem__(self, i):
        return self.sentences[i], self.taggings[i]

    def __len__(self):
        return len(self.sentences)

In [62]:
#train_dataloader = DataLoader(list(zip(aligned_trainedsents, aligned_trainedtags)), batch_size=64, shuffle=True)
sent_ids, tag_ids = convert_to_ids(aligned_trainedsents, aligned_trainedtags)
train_dataloader = DataLoader(PosTaggingDataset(sent_ids, tag_ids), batch_size=64, collate_fn=collate_fn, shuffle=True)

In [64]:
for x, y in train_dataloader:
    print(x.shape)
    print(y.shape)    
    break

torch.Size([64, 151])
torch.Size([64, 151])


In [11]:
train_features, train_labels = next(iter(train_dataloader))
print(f"Feature batch shape: {train_features.size()}")
print(f"Labels batch shape: {train_labels.size()}")
img = train_features[0].squeeze()
label = train_labels[0]

print(f"Label: {label}")

Feature batch shape: torch.Size([64, 66])
Labels batch shape: torch.Size([64, 66])
Label: tensor([ 0,  2,  3,  1,  2,  3,  1,  7,  8,  6,  2,  8, 10,  6,  2,  3,  8,  9,
         9,  8,  1,  2,  9,  9,  8,  7,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0])


In [66]:
class LinearProbeBert(nn.Module):
    def __init__(self, num_labels):
        super().__init__()
        self.bert = BertModel.from_pretrained("bert-base-multilingual-cased")
        self.probe = nn.Linear(self.bert.config.hidden_size, num_labels)
        self.to(device)

    def parameters(self):
        return self.probe.parameters()
  
    def forward(self, sentences):
        with torch.no_grad(): # no training of BERT parameters
            word_rep, sentence_rep = self.bert(sentences, return_dict=False)
        return self.probe(word_rep)
    
def perf(model, loader):
    criterion = nn.CrossEntropyLoss()
    model.eval() # do not apply training-specific steps such as dropout
    total_loss = correct = num_loss = num_perf = 0
    for x, y in loader:
        with torch.no_grad(): # no need to store computation graph for gradients
      # perform inference and compute loss
            y_scores = model(x)
            loss = criterion(y_scores.view(-1, len(label_vocab)), y.view(-1)) # requires tensors of shape (num-instances, num-labels) and (num-instances)

      # gather loss statistics
            total_loss += loss.item()
            num_loss += 1

      # gather accuracy statistics
            y_pred = torch.max(y_scores, 2)[1] # compute highest-scoring tag
            mask = (y != 0) # ignore <pad> tags
            correct += torch.sum((y_pred == y) * mask) # compute number of correct predictions
            num_perf += torch.sum(mask).item()
    return total_loss / num_loss, correct.item() / num_perf

# without training, accuracy should be a bit less than 2% (chance of getting a label correct)
perf(LinearProbeBert(len(label_vocab)), train_dataloader)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


(3.3657113143375943, 0.03513685297739642)