In [43]:
from transformers import BertTokenizer, BertModel
from tokenizers import trainers
import torch
import pandas as pd
import collections
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
#from torchsummary import summary
import pandas as pd
from tqdm import tqdm
import conllu
import re

In [2]:
class BERTDataset(Dataset):
    def __init__(self, path, txt_file, tokenizer, max_length):
        super(BERTDataset, self).__init__()
        self.path = path
        self.train_set = pd.read_csv(txt_file, delimiter='\t', header=None, index_col=None)
        self.train_set.drop(0, inplace=True, axis=1)
        self.max_length = max_length
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.train_set)
    def __getitem__(self, index):
        sent_1 = self.train_set.iloc[index]
        # print(sent_1) for debugging
        inputs = self.tokenizer.encode_plus(sent_1, truncation=True, max_length=self.max_length, return_attention_mask=True, return_tensors="pt")
        ids = inputs["input_ids"]
        token_type_ids = inputs["token_type_ids"]
        mask = inputs["attention_mask"]
        return {"ids": torch.tensor(ids, dtype=torch.long), "mask": torch.tensor(mask, dtype=torch.long), "token_type_ids": torch.tensor(token_type_ids, dtype=torch.long)}

Test set clean up. We are using the universal taggings

In [26]:
eval_file = open("UD_Faroese-OFT/fo_oft-ud-test.conllu", "r", encoding="utf-8") 
data = conllu.parse(eval_file.read())  # ['id', 'form', 'lemma', 'upostag', 'xpostag', 'feats', 'head', 'deprel', 'deps', 'misc']
sentences = [[token['form'] for token in sentence] for sentence in data]
x_taggings = [[token['xpostag'] for token in sentence] for sentence in data]  # weird tags
u_taggings = [[token['upostag'] for token in sentence] for sentence in data]

# faroese_oft = [sent for sent in conllu.parse_incr(eval_file)]
print(sentences[-4])
print(x_taggings[-4])
print(u_taggings[-4])

['Í', '2016', 'væntar', 'oyggin', 'at', 'fáa', 'yvir', 'tíggju', 'milliónir', 'ferðafólk', 'á', 'vitjan', '.']
['Pr', 'Num', 'V', 'N', 'IM', 'V', 'Pr', 'Num', 'N', 'N', 'Pr', 'N', 'CLB']
['ADP', 'NUM', 'VERB', 'NOUN', 'PART', 'VERB', 'ADP', 'NUM', 'NOUN', 'NOUN', 'ADP', 'NOUN', 'PUNCT']


Vocab Augmentation

In [4]:
corpus_file = "fao_wikipedia_2021_30K-sentences.txt"

f = open(corpus_file, 'r', encoding="utf-8")
faroese_Regex = re.compile(r"^\d+\s+")
faroese_sents = [faroese_Regex.sub('', sent) for sent in f.readlines()]  # for faroese
punctuation = {0x2018:0x27, 0x2019:0x27, 0x201C:0x22, 0x201D:0x22, 0x2013:0x2D, 0x2010:0x2D, 0x2014:0x2D, 0x2026:0x85}
faroese_sents = [sent.translate(punctuation) for sent in faroese_sents]

faroese_words = [sent.split() for sent in faroese_sents]
faroese_words = [word for sent in faroese_words for word in sent]
f.close()
train_corpus = "".join(faroese_sents)
#tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased", do_lower_case=False)
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")

bert_model = BertModel.from_pretrained("bert-base-multilingual-cased")

long_tokens = list()
for w in faroese_words: 
    single_tokens = tokenizer.tokenize(w)
    long_tokens.append((w, single_tokens))

longest = [subwords[0] for subwords in long_tokens if len(subwords[1]) >= 11]  # try segment from 10 or higher and clean web addresses as well as () and '
# 12 subtokens == 44 words
# 11 subtokens == 96 words

bert_vocab = tokenizer.get_vocab().keys()
words_in_bert = set([word in bert_vocab for word in longest])
longest = [word for word in longest if word[:4] != "http"]
foreign_tokensRegex = re.compile(r"^(\(|'|\")")
subword_tokens = [word for word in longest if not foreign_tokensRegex.search(word)] 
special_tokens_dict = {'additional_special_tokens': subword_tokens}
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)

# Resizing

bert_model.resize_token_embeddings(len(tokenizer)) 

# dataset = BERTDataset('.', corpus_file, tokenizer, max_length=100)
# dataloader = DataLoader(dataset=dataset, batch_size=32)



Downloading:   0%|          | 0.00/972k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/681M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Embedding(119621, 768)

In [None]:
# Tokenizing whole text UNNEEDED

#text = pd.read_csv()
#tokenised_text = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)

In [28]:
def align_tokenizations(sentences, taggings):  # https://pageperso.lis-lab.fr/benoit.favre/pstaln/09_embedding_evaluation.html
    bert_tokenized_sentences = []
    aligned_taggings = []

    for sentence, tagging in zip(sentences, taggings):
    # first generate BERT-tokenization        
        bert_tokenized_sentence = tokenizer.tokenize(' '.join(sentence))

        aligned_tagging = []
        current_word = ''
        index = 0 # index of current word in sentence and tagging
        for token in bert_tokenized_sentence:
            current_word += re.sub(r'^##', '', token) # recompose word with subtoken
            #sentence[index] = sentence[index].replace('\xad', '') # fix bug in data

      # note that some word factors correspond to unknown words in BERT
            assert token == '[UNK]' or sentence[index].startswith(current_word)  

            if token == '[UNK]' or sentence[index] == current_word: # if we completed a word
                current_word = ''
                aligned_tagging.append(tagging[index])
                index += 1
            else: # otherwise insert padding
                aligned_tagging.append('[PAD]')                
            

        assert len(bert_tokenized_sentence) == len(aligned_tagging)

        bert_tokenized_sentences.append(bert_tokenized_sentence)
        aligned_taggings.append(aligned_tagging)

    return bert_tokenized_sentences, aligned_taggings

aligned_sents, aligned_tags = align_tokenizations(sentences, u_taggings)

In [31]:
device = torch.device('cuda' if torch.cuda.is_available else 'cpu')
label_vocab = collections.defaultdict(lambda: len(label_vocab))
label_vocab['<pad>'] = 0

def convert_to_ids(sentences, taggings):
    sentences_ids = []
    taggings_ids = []
    for sentence, tagging in zip(sentences, taggings):
        sentence_tensor = torch.tensor(tokenizer.convert_tokens_to_ids(['[CLS]'] + sentence + ['SEP'])).long()
        tagging_tensor = torch.tensor([0] + [label_vocab[tag] for tag in tagging] + [0]).long()

        sentences_ids.append(sentence_tensor.to(device))
        taggings_ids.append(tagging_tensor.to(device))
    return sentences_ids, taggings_ids


sent_ids, tag_ids = convert_to_ids(aligned_sents, aligned_tags)
#train_sentences_ids, train_taggings_ids = convert_to_ids(train_bert_tokenized_sentences, train_aligned_taggings)
#valid_sentences_ids, valid_taggings_ids = convert_to_ids(valid_bert_tokenized_sentences, valid_aligned_taggings)
#test_sentences_ids, test_taggings_ids = convert_to_ids(test_bert_tokenized_sentences, test_aligned_taggings)
print(aligned_sents[0])
print(sent_ids[0])  
print(aligned_tags[0])
print(tag_ids[0])  
print()
print(aligned_sents[1])
print(sent_ids[1])  
print(aligned_tags[1])
print(tag_ids[1])  

print('num labels:', len(label_vocab))

['Før', '##oya', '##r', 'eru', 'ikki', 'li', '##mur', 'í', 'ES', ',', 'hóa', '##st', 'Danmark', 'er', '.']
tensor([   101,  58997,  51814,  10129,  18098, 106434,  11614,  33246,    267,
         49004,    117,  20105,  10562,  17271,  10163,    119,    100],
       device='cuda:0')
['[PAD]', '[PAD]', 'PROPN', 'AUX', 'ADV', '[PAD]', 'NOUN', 'ADP', 'NOUN', 'PUNCT', '[PAD]', 'SCONJ', 'PROPN', 'VERB', 'PUNCT']
tensor([0, 1, 1, 2, 3, 4, 1, 5, 6, 5, 7, 1, 8, 2, 9, 7, 0], device='cuda:0')

['Av', '##hal', '##ds', '##rør', '##sla', '##n', 'byrja', '##r', '.']
tensor([  101, 33799, 18453, 13268, 93664, 61432, 10115, 98195, 10129,   119,
          100], device='cuda:0')
['[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', 'NOUN', '[PAD]', 'VERB', 'PUNCT']
tensor([0, 1, 1, 1, 1, 1, 5, 1, 9, 7, 0], device='cuda:0')
num labels: 18


In [38]:
def collate_fn(items):
    max_len = max(len(item[0]) for item in items)

    sentences = torch.zeros((len(items), max_len), device=items[0][0].device).long().to(device)
    taggings = torch.zeros((len(items), max_len)).long().to(device)

    for i, (sentence, tagging) in enumerate(items):
        sentences[i][0:len(sentence)] = sentence
        taggings[i][0:len(tagging)] = tagging

    return sentences, taggings

x, y = collate_fn([[torch.tensor([1, 2, 3]), torch.tensor([4, 5, 6])], [torch.tensor([1, 2]), torch.tensor([3, 4])]])
print(x.shape, y.shape)
print(y)
print()
#x, y = collate_fn(sent_ids, tag_ids)

torch.Size([2, 3]) torch.Size([2, 3])
tensor([[4, 5, 6],
        [3, 4, 0]], device='cuda:0')



In [37]:
from torch.utils.data import Dataset

class PosTaggingDataset(Dataset):
    def __init__(self, sentences, taggings):
        assert len(sentences) == len(taggings)
        self.sentences = sentences
        self.taggings = taggings

    def __getitem__(self, i):
        return self.sentences[i], self.taggings[i]

    def __len__(self):
        return len(self.sentences)

In [41]:
from torch.utils.data import DataLoader

batch_size = 64

eval_loader = DataLoader(PosTaggingDataset(sent_ids, tag_ids), batch_size=batch_size, collate_fn=collate_fn, shuffle=True)

tensor([[  101,   151, 11637,  ...,     0,     0,     0],
        [  101, 33884, 10661,  ...,     0,     0,     0],
        [  101, 36702, 36767,  ...,     0,     0,     0],
        ...,
        [  101,   157,   119,  ...,     0,     0,     0],
        [  101,   148, 26776,  ...,     0,     0,     0],
        [  101, 51874, 20084,  ...,     0,     0,     0]], device='cuda:0') tensor([[0, 1, 4,  ..., 0, 0, 0],
        [0, 1, 6,  ..., 0, 0, 0],
        [0, 1, 9,  ..., 0, 0, 0],
        ...,
        [0, 1, 1,  ..., 0, 0, 0],
        [0, 1, 1,  ..., 0, 0, 0],
        [0, 1, 1,  ..., 0, 0, 0]], device='cuda:0')


In [44]:
class LinearProbeBert(nn.Module):
    def __init__(self, num_labels):
        super().__init__()
        self.bert = BertModel.from_pretrained("bert-base-multilingual-cased")
        self.probe = nn.Linear(self.bert.config.hidden_size, num_labels)
        self.to(device)

    def parameters(self):
        return self.probe.parameters()
  
    def forward(self, sentences):
        with torch.no_grad(): # no training of BERT parameters
            word_rep, sentence_rep = self.bert(sentences, return_dict=False)
        return self.probe(word_rep)

# the model should return a tensor of shape (batch size, sequence length, number of labels)
bert_model = LinearProbeBert(len(label_vocab))
y = bert_model(torch.tensor([[0, 1, 2], [3, 4, 5]]).to(device))
print(y.shape)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


torch.Size([2, 3, 18])


In [49]:
class fineBERT(nn.Module): 
    def __init__(self, num_labels):
        super(fineBERT, self).__init__()
        self.bert_model = BertModel.from_pretrained("bert-base-multilingual-cased")
        self.probe = nn.Linear(self.bert_model.config.hidden_size, num_labels)
        self.to(device)

    def forward(self, ids, mask, token_type_ids):
        _, output = self.bert_model(ids, attention_mask=mask, token_type_ids=token_type_ids, return_dict=False)
        return self.out(output)
    

loss_fn = torch.nn.BCEWithLogitsLoss()

model = fineBERT(len(label_vocab))
model.cuda()
#optimizer= torch.optim.Adam(model.parameters(),lr= 0.0001)

for param in model.bert_model.parameters():
        param.requires_grad = False

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [50]:
model

fineBERT(
  (bert_model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr