In [None]:
from transformers import BertTokenizer, BertModel
from tokenizers import trainers
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
#from torchsummary import summary
import pandas as pd
from tqdm import tqdm
from conllu import parse_incr
import re

In [None]:
class BERTDataset(Dataset):
    """Map-style dataset that tokenizes one sentence per row of a tab-separated file.

    The file is read with pandas (no header row). Column 0 is dropped and the
    first remaining column is used as the sentence text.

    Args:
        path: Stored on the instance as ``self.path``; not otherwise used by this class.
        txt_file: Path of the tab-separated corpus file.
        tokenizer: Object exposing ``encode_plus`` (e.g. a ``BertTokenizer``).
        max_length: Length every encoded sentence is truncated or padded to.
    """

    def __init__(self, path, txt_file, tokenizer, max_length):
        super(BERTDataset, self).__init__()
        self.path = path
        self.train_set = pd.read_csv(txt_file, delimiter='\t', header=None, index_col=None)
        # Column 0 is discarded -- presumably the sentence id; verify against the corpus format.
        self.train_set.drop(0, inplace=True, axis=1)
        self.max_length = max_length
        self.tokenizer = tokenizer

    def __len__(self):
        """Return the number of rows (sentences) in the corpus."""
        return len(self.train_set)

    def __getitem__(self, index):
        """Encode the sentence at ``index``.

        Returns:
            dict with keys ``"ids"``, ``"mask"`` and ``"token_type_ids"``, each a
            1-D ``torch.long`` tensor of length ``max_length``.
        """
        # Pick the scalar cell, not the whole row: ``iloc[index]`` alone yields a
        # pandas Series, which is not valid text input for the tokenizer.
        sentence = str(self.train_set.iloc[index, 0])
        # Pad to a fixed length so the default DataLoader collate can stack items,
        # and let the tokenizer return plain lists so every item is 1-D (no
        # leading batch dimension of size 1).
        encoded = self.tokenizer.encode_plus(
            sentence,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_attention_mask=True,
            return_token_type_ids=True,
        )
        return {
            "ids": torch.tensor(encoded["input_ids"], dtype=torch.long),
            "mask": torch.tensor(encoded["attention_mask"], dtype=torch.long),
            "token_type_ids": torch.tensor(encoded["token_type_ids"], dtype=torch.long),
        }

In [None]:
# https://www.youtube.com/watch?v=lvJRFMvWtFI
# Parse every sentence of the UD Faroese-OFT test file. The parser reads from
# the handle incrementally, so the list is built inside the `with` block and
# the file is closed as soon as parsing finishes.
with open("UD_Faroese-OFT/fo_oft-ud-test.conllu", "r", encoding="utf-8") as eval_file:
    faroese_oft = list(parse_incr(eval_file))

def read_conll(input_file):
    """Collect the sentence text from the ``# text = ...`` lines of a CoNLL-U file.

    Args:
        input_file: Path of a UTF-8 encoded CoNLL-U file.

    Returns:
        list[str]: One entry per ``# text =`` line, in file order, with the
        ``# text = `` prefix and the line terminator removed.
    """
    regexSent = re.compile(r"^#\stext\s=\s")
    text_oft = []
    # The context manager closes the handle even if reading fails part-way.
    with open(input_file, encoding="utf-8") as conll_file:
        for line in conll_file:
            if line.startswith("# text ="):
                # Drop the line terminator so callers that join the sentences
                # with "\n" do not end up with blank lines between them.
                text_oft.append(regexSent.sub('', line).rstrip("\r\n"))
    return text_oft

# Create the tokenizer in this cell: it is otherwise only assigned in a later
# cell, so on a fresh kernel run top-to-bottom this cell would fail with a
# NameError on `tokenizer`.
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")

# Join the sentence texts of the OFT test file into one string, then split it
# into tokens with the tokenizer; `texts` ends up holding the token list.
texts = "\n".join(read_conll("UD_Faroese-OFT/fo_oft-ud-test.conllu"))
texts = tokenizer.tokenize(texts)

In [None]:
corpus_file = "fao_wikipedia_2021_30K-sentences.txt"

# Removes a leading run of digits plus the whitespace after it from each line.
faroese_Regex = re.compile(r"^\d+\s+")

# Read the corpus inside a context manager so the handle is closed even if
# reading or decoding raises part-way through.
with open(corpus_file, 'r', encoding="utf-8") as f:
    faroese_sents = [faroese_Regex.sub('', sent) for sent in f]  # for faroese

# Flat list of whitespace-separated words across all sentences.
faroese_words = [word for sent in faroese_sents for word in sent.split()]
punc_tokens = ['“', '”', '´', '`', '–', '‐', '’', '‘', '—', '…']

tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")
dataset = BERTDataset('.', corpus_file, tokenizer, max_length=100)
dataloader = DataLoader(dataset=dataset, batch_size=32)

In [None]:
class fineBERT(torch.nn.Module):
    """Pretrained BERT encoder followed by a single linear layer.

    Args:
        model_name: Checkpoint name passed to ``BertModel.from_pretrained``.
            Defaults to the checkpoint this notebook used originally.
        num_outputs: Output width of the linear head (default 1).
    """

    def __init__(self, model_name="bert-base-multilingual-cased", num_outputs=1):
        super(fineBERT, self).__init__()
        self.bert_model = BertModel.from_pretrained(model_name)
        # Size the head from the loaded model's config rather than hard-coding
        # 768, so a checkpoint with a different hidden size still fits.
        self.out = torch.nn.Linear(self.bert_model.config.hidden_size, num_outputs)

    def forward(self, ids, mask, token_type_ids):
        """Run the encoder and apply the linear head to its second output.

        Args:
            ids: Token id tensor passed to the encoder as its first argument.
            mask: Attention mask tensor.
            token_type_ids: Segment id tensor.

        Returns:
            The linear layer's output for the encoder's second return value.
        """
        # return_dict=False makes the encoder return a tuple; only the second
        # element (the pooled output, per the BertModel docs) feeds the head.
        _, pooled = self.bert_model(ids, attention_mask=mask, token_type_ids=token_type_ids, return_dict=False)
        return self.out(pooled)
    

loss_fn = torch.nn.BCEWithLogitsLoss()

model = fineBERT()

# Freeze the BERT encoder before the optimizer is built, so that only the
# parameters outside `bert_model` receive gradients.
for param in model.bert_model.parameters():
    param.requires_grad = False

# `optim` is not imported in the cells shown, so `optim.Adam` raises NameError;
# reach Adam through the already-imported `torch` package instead.
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)