In [94]:
import pandas as pd
import conllu
import gzip
import io
import transformers
from transformers import AutoTokenizer
from torch.utils.data import Dataset
import os
import random
import tqdm
import numpy as np
import torch
import gensim
from transformers import BertForSequenceClassification, AutoTokenizer
from transformers import BertTokenizer
from sklearn.model_selection import train_test_split
from datasets import ClassLabel
from torch.utils.data import DataLoader
from typing import List, Tuple
import warnings
warnings.filterwarnings('ignore')

In [87]:
def seed_everything(seed_value=5550):
    "Set same seed to all random operations for reproduceability purposes"
    os.environ['PYTHONHASHSEED'] = str(seed_value)
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)

    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    
seed_everything()

In [88]:
def load_embedding(modelfile):
    " Loading the file that is used as the embedding layer in the Neural Network"
    # Detect the model format by its extension:
    # Binary word2vec format:
    if modelfile.endswith(".bin.gz") or modelfile.endswith(".bin"):
        emb_model = gensim.models.KeyedVectors.load_word2vec_format(
            modelfile, binary=True, unicode_errors="replace"
        )
    # Text word2vec format:
    elif (
        modelfile.endswith(".txt.gz")
        or modelfile.endswith(".txt")
        or modelfile.endswith(".vec.gz")
        or modelfile.endswith(".vec")
    ):
        emb_model = gensim.models.KeyedVectors.load_word2vec_format(
            modelfile, binary=False, unicode_errors="replace"
        )
    # ZIP archive from the NLPL vector repository:
    elif modelfile.endswith(".zip"):
        with zipfile.ZipFile(modelfile, "r") as archive:
            stream = archive.open(
                "model.bin"  # or model.txt, if you want to look at the model
            )
            emb_model = gensim.models.KeyedVectors.load_word2vec_format(
                stream, binary=True, unicode_errors="replace"
            )
    else:  # Native Gensim format?
        emb_model = gensim.models.KeyedVectors.load(modelfile)
        #  If you intend to train the model further:
        # emb_model = gensim.models.Word2Vec.load(embeddings_file)
    return emb_model

print("Loading embedding ...")
embedding_path = "data/58/model.bin"
embedding = load_embedding(embedding_path)
embedding["[UNK]"] = torch.tensor(embedding.vectors).mean(dim=0).numpy()
embedding["[PAD]"] = torch.zeros(embedding.vector_size).numpy()
print("Loading embedding done")

Loading embedding ...
Loading embedding done


In [95]:
class CollateFunctor:
    def __init__(self, padding_index: int, max_length: int):
        self.padding_index = padding_index
        self.max_length = max_length

    def __call__(self, samples: List[Tuple[torch.Tensor, torch.Tensor]]) -> Tuple[torch.Tensor, torch.Tensor]:
        input_ids_sentencee = [s for s, y in samples]
        labels = [y for s, y in samples]

        input_ids_padded_sentence = torch.nn.utils.rnn.pad_sequence(
                                    input_ids_sentencee,
                                    batch_first = True,
                                    padding_value = self.padding_index
                                )
        
        input_ids_padded_sentence = input_ids_padded_sentence[:, :self.max_length]
        
        labels = torch.LongTensor(labels)
        
        return input_ids_padded_sentence, labels  

In [89]:
def open_and_read_path(data_path):
    # Load the CoNLL-U file    
    with gzip.open(data_path, "rb") as f:
        data = io.TextIOWrapper(f, encoding="utf-8").read()

    # Parse the CoNLL-U file using conllu library
    parsed_data = conllu.parse(data)

    # Extract the token and named entity label from the CoNLL-U file
    sentences = []
    tags = []
    labels = []
    metadata = []
    for sentence in parsed_data:
        tokens = []
        tokens_tags = []
        token_label = []
        token_metadata = []
        for token in sentence:
            # Extract the token and named entity label
            tokens.append(token['form'])
            tokens_tags.append(token["upos"])
            token_label.append(token['misc']['name'])
            token_metadata.append(token['feats'])
        sentences.append(" ".join(tokens))
        tags.append(" ".join(tokens_tags))
        labels.append(" ".join(token_label))
        metadata.append(token_metadata)

    data_dict = {"sentence" : sentences,
                 "labels" : labels}

    data = pd.DataFrame(data_dict)

    return data

In [90]:
path = 'data/norne-nb-in5550-train.conllu.gz'

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

data = open_and_read_path(path)
data.head()


Unnamed: 0,sentence,labels
0,Rus har en klar sosial dimensjon ; man drikker...,O O O O O O O O O O O O O O O O O O O O O O O O
1,I tillegg skal daglig leder ved Miljøtransport...,O O O O O O B-ORG I-ORG O O O O O O O O O O O ...
2,« Angående vår nærmere lovede undersøkelse på ...,O O O O O O O O O O O O O O O O O O O O O O O ...
3,- I verste fall kan det være dødelig hvis du s...,O O O O O O O O O O O O O O O B-PER O
4,At vi i det hele tatt har det er et resultat a...,O O O O O O O O O O O O O O O O O O O B-LOC O ...


In [6]:
#num_classes = data["labels"].nunique()
num_classes = list(sorted(set(" ".join(list(data['labels'])).split(" "))))

In [7]:
train_df, val_df = train_test_split(data,
                                    train_size=0.7,
                                    random_state=5550)


train_texts = train_df["sentence"].to_list()
text_labels = train_df["labels"].to_list()

tokens = tokenizer(train_texts,
          return_tensors="pt",
          padding=True,
          truncation=True,
          max_length=64)

unk_index = embedding.get_index("[UNK]")

for document in data["sentence"]:
    print(document.split(" "))
    break
    
data["sentence"].str.split(" ")
#sentences[0]

['Rus', 'har', 'en', 'klar', 'sosial', 'dimensjon', ';', 'man', 'drikker', 'for', 'å', 'signalisere', 'noe', 'om', 'hvem', 'man', 'er', ',', 'og', 'hvem', 'man', 'ikke', 'er', '.']


0        [Rus, har, en, klar, sosial, dimensjon, ;, man...
1        [I, tillegg, skal, daglig, leder, ved, Miljøtr...
2        [«, Angående, vår, nærmere, lovede, undersøkel...
3        [-, I, verste, fall, kan, det, være, dødelig, ...
4        [At, vi, i, det, hele, tatt, har, det, er, et,...
                               ...                        
18093                        [-, Bryne, er, et, søppellag]
18094    [I, så, fall, har, Ulsrud, sikret, EM-medalje,...
18095    [Men, det, vil, alltid, ,, som, jeg, har, sagt...
18096    [Vi, vil, finne, løsninger, som, reduserer, ut...
18097    [-, Jeg, kjørte, ikke, spesielt, bra, ,, jeg, ...
Name: sentence, Length: 18098, dtype: object

In [91]:
class EmbDataset(Dataset):
    def __init__(self, data, embedding, label_vocab = None):

        self.unk_index = embedding.get_index("[UNK]")
        self.sentences = [
            [
                embedding.get_index(token.lower(), default = self.unk_index)
                for token in document
            ] 
            for document in data["sentence"].str.split(" ")
        ]
            
        unk_tokens = sum(token == self.unk_index for document in self.sentences for token in document)
        n_tokens = sum(len(document) for document in self.sentences)
        print(f"Percentage of unknown tokens: {unk_tokens / n_tokens * 100.0:.2f}%")
        
        self.label = list(data['labels'])
        self.label_vocab = label_vocab if label_vocab is not None else list(sorted(set(" ".join(self.label).split(" "))))
        self.num_labels = len(self.label_vocab)
        self.label_indexer = {i: n for n, i in enumerate(self.label_vocab)}
        print("\nLabel dictionary:", self.label_indexer)
        
    def __getitem__(self, index):
        current_tokens_sentence = self.sentences[index]
        current_labels = self.label[index]

        sentence = torch.LongTensor(current_tokens_sentence)

        labels = []
        for label in current_labels.split(" "):
            labels.append(self.label_indexer[label])
        
        y = torch.LongTensor(labels)
        return sentence, y

    def __len__(self):
        return len(self.sentences)
    
    

In [111]:
from dataset import EmbDataset
from useful_functions import CollateFunctor

In [112]:
train_dataset = EmbDataset(train_df, embedding)
type(train_dataset)

		Percentage of unknown tokens: 1.16%


dataset.EmbDataset

In [113]:
emb_val_dataloader = DataLoader(train_dataset,
                              batch_size=32,
                              shuffle=True,
                              drop_last=True,
                              num_workers=1,
                              collate_fn=CollateFunctor(embedding.get_index("[PAD]"),
                                                        64)
                              )

In [114]:
for sentence, y in tqdm.tqdm(emb_val_dataloader):
    print(sentence)
    break

  0%|                                                   | 0/395 [00:04<?, ?it/s]


TypeError: Caught TypeError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/torch/utils/data/_utils/worker.py", line 302, in _worker_loop
    data = fetcher.fetch(index)
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py", line 61, in fetch
    return self.collate_fn(data)
  File "/Users/coco/Documents/Uio/IN5550/oblig/IN5550/oblig3/useful_functions.py", line 72, in __call__
    labels = torch.LongTensor(labels)
TypeError: only integer tensors of a single element can be converted to an index


In [None]:
print(len(train_df.iloc[0]["sentence"].split(" ")), len(train_df.iloc[0]["labels"].split(" ")))

train_df.iloc[3]["labels"]


In [None]:
train_df.iloc[0]

In [None]:
train_dataset[0]

In [None]:
len(train_dataset[0][0]), len(train_dataset[0][1])


In [48]:
def get_mapping_matrix_batched(offset_mapping, lengths, n_subwords: int, n_words: int):
    mapping = torch.zeros(len(lengths), n_words, n_subwords)

    for i_batch in range(len(lengths)):
        current_word, remaining_len = 0, lengths[i_batch][0]

        for i, (start, end) in enumerate(offset_mapping[i_batch]):
            if start == end:
                continue

            mapping[i_batch, current_word, i] = 1
            remaining_len -= end - start

            if remaining_len <= 0 and current_word < len(lengths[i_batch]) - 1:
                current_word += 1
                remaining_len = lengths[i_batch][current_word]

    return mapping

offsets = train_dataset[0]
#word_lengths = train_dataset[0]
#subword_ids = train_dataset[0]
#n_subwords = subword_ids.size(1)
#n_words = max(len(words) for words in word_lengths)

#offsets


In [54]:
word_lengths = 0
offsets = 0
n_subwords = 0
n_words = 0
for input_ids, attention_mask, y, offset_mapping, word_length in tqdm.tqdm(train_iter):
    offsets = offset_mapping
    word_lengths = word_length
    n_subwords = input_ids.size(1)
    n_words = max(len(words) for words in word_lengths)
    
    break

  0%|                                                  | 0/2534 [00:00<?, ?it/s]2023-03-30 14:04:07.507855: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


64 64


  0%|                                                  | 0/2534 [00:10<?, ?it/s]


In [57]:
mapping_matrix = get_mapping_matrix_batched(offsets, word_lengths, n_subwords, n_words)
mapping_matrix.shape, mapping_matrix


(torch.Size([5, 64, 64]),
 tensor([[[0., 1., 0.,  ..., 0., 0., 0.],
          [0., 0., 1.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          ...,
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]],
 
         [[0., 1., 0.,  ..., 0., 0., 0.],
          [0., 0., 1.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          ...,
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]],
 
         [[0., 1., 0.,  ..., 0., 0., 0.],
          [0., 0., 1.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          ...,
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]],
 
         [[0., 1., 0.,  ..., 0., 0., 0.],
          [0., 0., 1.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          ...,
          [0., 0., 0., 

In [72]:
path = 'data/norne-nb-in5550-train.conllu.gz'

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

data = open_and_read_path(path)
data.head()


Unnamed: 0,sentence,labels
0,Rus har en klar sosial dimensjon ; man drikker...,O O O O O O O O O O O O O O O O O O O O O O O O
1,I tillegg skal daglig leder ved Miljøtransport...,O O O O O O B-ORG I-ORG O O O O O O O O O O O ...
2,« Angående vår nærmere lovede undersøkelse på ...,O O O O O O O O O O O O O O O O O O O O O O O ...
3,- I verste fall kan det være dødelig hvis du s...,O O O O O O O O O O O O O O O B-PER O
4,At vi i det hele tatt har det er et resultat a...,O O O O O O O O O O O O O O O O O O O B-LOC O ...


In [85]:
import torch
import torch.nn as nn
import torch.nn.functional as F

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")


class FakeBERT:
    def __init__(self, hidden_dim):
        self.hidden_dim = hidden_dim
    
    def __call__(self, ids):
        return torch.randn(ids.shape + (self.hidden_dim,))


bert = FakeBERT(hidden_dim=4)

sentences = ["Let's start tokenizing", "What a VERYVERYVERY pretty sentence", "three"]
data = pd.DataFrame()
data["sentence"] = sentences
data["labels"] = ["O O O", "O O O O O", "O"]
train_dataset = NotebookTokenDataset(data, tokenizer)

train_iter = DataLoader(train_dataset,
                        batch_size=3,
                        shuffle=False,
                        num_workers=1
                        )
input_ids = 0
word_lengths = 0
offsets = 0
n_subwords = 0
n_words = 0
for input_ids, attention_mask, y, offset_mapping, word_length in tqdm.tqdm(train_iter):
    input_ids = input_ids
    offsets = offset_mapping
    word_lengths = word_length
    n_subwords = input_ids.size(1)
    n_words = max(len(words) for words in word_lengths)
    print(n_subwords, n_words)
    break


  0%|                                                     | 0/1 [00:00<?, ?it/s]Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/multiprocessing/spawn.py", line 116, in spawn_main
    exitcode = _main(fd, parent_sentinel)
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/multiprocessing/spawn.py", line 126, in _main
    self = reduction.pickle.load(from_parent)
  File "/Users/coco/Documents/Uio/IN5550/oblig/IN5550/oblig3/dataset.py", line 9, in <module>
    from useful_functions import encoder
ImportError: cannot import name 'encoder' from 'useful_functions' (/Users/coco/Documents/Uio/IN5550/oblig/IN5550/oblig3/useful_functions.py)
  0%|                                                     | 0/1 [00:04<?, ?it/s]


RuntimeError: DataLoader worker (pid(s) 34149) exited unexpectedly

In [80]:
bert = FakeBERT(hidden_dim=4)
contextualized_embeddings = bert(input_ids)

print(contextualized_embeddings.shape)

mapping_matrix = get_mapping_matrix_batched(offsets, word_lengths, n_subwords, n_words)
(mapping_matrix.shape)


torch.Size([3, 15, 4])


torch.Size([3, 64, 15])

In [84]:
summ = torch.einsum('bij, bjk -> bik', mapping_matrix, contextualized_embeddings)
res = summ / torch.clamp(torch.sum(mapping_matrix, dim = 2, keepdim = True), 1)
res.shape, res


(torch.Size([3, 64, 4]),
 tensor([[[ 0.1737,  0.4180,  0.6225,  0.6529],
          [-0.0260,  1.3363,  0.3760, -2.1045],
          [ 0.2998, -1.9301, -0.9482,  0.9698],
          [ 0.0394,  1.5005,  0.5351,  1.5890],
          [-3.0134, -0.4731, -0.0977,  0.0247],
          [ 0.0339,  1.6950, -0.6612,  0.0854],
          [ 0.0000,  0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000,  0.0000],
          [ 0.0

In [59]:
encoding = tokenizer(sentences, padding=True, return_offsets_mapping=True)
subword_ids = torch.tensor(encoding.input_ids)
offsets = encoding.offset_mapping

bert = FakeBERT(hidden_dim=4)
contextualized_embeddings = bert(subword_ids)

contextualized_embeddings.shape

torch.Size([3, 15, 4])

In [7]:
class TokenDataset(Dataset):
    def __init__(self, data, tokenizer, label_vocab = None):
        
        self.max_length = 64
        
        self.train_texts = data["sentence"].to_list()
        self.labels = list(data["labels"])
        
        self.encoding = tokenizer(self.train_texts,
                                 padding = True,
                                 truncation = True,
                                 return_offsets_mapping = True,
                                 max_length = self.max_length)
        
        self.input_ids = torch.tensor(self.encoding.input_ids)
        self.attention_mask = torch.tensor(self.encoding["attention_mask"])
        
        self.offsets = self.encoding.offset_mapping
        self.word_lengths = [[len(word) for word in sentence.split()] for sentence in self.train_texts]

        self.label_vocab = label_vocab if label_vocab is not None else list(sorted(set(" ".join(self.labels).split(" "))))
        self.num_labels = len(self.label_vocab)
        self.label_indexer = {i: n for n, i in enumerate(self.label_vocab)}
        print("\nLabel dictionary:", self.label_indexer)
        
    def __getitem__(self, index):
        current_input_ids = self.input_ids[index]
        current_attention_mask = self.attention_mask[index]

        input_ids = torch.LongTensor(current_input_ids)
        attention_mask = torch.LongTensor(current_attention_mask)

        
        def pad(l, content, width):
            l.extend([content] * (width - len(l)))
            return l
            
        current_labels = self.labels[index]
        value = self.label_indexer["O"]
        labels = []
        for label in current_labels.split(" "):
            labels.append(self.label_indexer[label])
        labels = pad(labels, value, self.max_length)
        
        y = torch.LongTensor(labels)
        
        offset_mapping = self.offsets[index]
        word_length = self.word_lengths[index]
        word_length = pad(word_length, 0, self.max_length)
        
        #mapping_matrix_info = {"offset_mapping" : offset_mapping,"word_length" : word_length}
        
        
        return input_ids, attention_mask, y, offset_mapping, word_length

    def __len__(self):
        return len(self.train_texts)
    

In [18]:
off = train_dataset[0][3]
torch.LongTensor(off)

tensor([[ 0,  0],
        [ 0,  9],
        [10, 16],
        [17, 22],
        [23, 25],
        [26, 27],
        [28, 31],
        [32, 35],
        [35, 37],
        [37, 38],
        [38, 42],
        [43, 45],
        [46, 48],
        [48, 51],
        [52, 55],
        [55, 57],
        [57, 58],
        [59, 61],
        [61, 62],
        [63, 64],
        [64, 65],
        [65, 67],
        [68, 70],
        [71, 74],
        [75, 76],
        [76, 77],
        [78, 81],
        [82, 83],
        [ 0,  0],
        [ 0,  0],
        [ 0,  0],
        [ 0,  0],
        [ 0,  0],
        [ 0,  0],
        [ 0,  0],
        [ 0,  0],
        [ 0,  0],
        [ 0,  0],
        [ 0,  0],
        [ 0,  0],
        [ 0,  0],
        [ 0,  0],
        [ 0,  0],
        [ 0,  0],
        [ 0,  0],
        [ 0,  0],
        [ 0,  0],
        [ 0,  0],
        [ 0,  0],
        [ 0,  0],
        [ 0,  0],
        [ 0,  0],
        [ 0,  0],
        [ 0,  0],
        [ 0,  0],
        [ 

In [115]:
train_df, val_df = train_test_split(data,
                                    train_size=0.7,
                                    random_state=5550)

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

In [116]:
from dataset import NotebookTokenDataset

In [117]:
train_dataset = NotebookTokenDataset(train_df, tokenizer)

In [118]:
print("tokenized sentence: ", train_dataset[0][0])
print("\nattention mask: ", train_dataset[0][1])
print("\nlabels: ", train_dataset[0][2])
print("\nOffsets: ", train_dataset[0][3])
print("\nWord length: ", train_dataset[0][4])

tokenized sentence:  tensor([  101,  1697, 14319,  7661, 21718,   178, 11850, 13354,  2180,   118,
         8144,  1120,  1110,  7609,  1155,  3121,  1181,  5871,  1197,   191,
        28200,  3740,  4035,  3687,   170,  1964,  3066,   119,   102,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0])

attention mask:  tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

labels:  tensor([16,  6, 14, 16, 16, 16,  0, 16, 16, 16, 16, 16, 16, 16, 16,  2, 16, 16,
        16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
        16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 1

In [120]:
print(len(train_dataset))

for i in range(5):
    print(len(train_dataset[i]))
    print(len(train_dataset[i][0]), len(train_dataset[i][1]), len(train_dataset[i][2]), len(train_dataset[i][3]), len(train_dataset[i][4]))
    print("\n")
    

12668
5
64 64 64 64 64


5
64 64 64 64 64


5
64 64 64 64 64


5
64 64 64 64 64


5
64 64 64 64 64




In [121]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

train_iter = DataLoader(train_dataset,
                        batch_size=5,
                        shuffle=False,
                        num_workers=1
                        )

In [15]:
for input_ids, attention_mask, y, offset_mapping, word_length in tqdm.tqdm(train_iter):
    print(input_ids)
    print("\n")
    print(attention_mask)
    print("\n")
    print(y)
    print("\n")
    print(offset_mapping)
    print("\n")
    print(word_length)
    break

  0%|                                                  | 0/2534 [00:00<?, ?it/s]2023-03-30 13:51:30.807441: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


tensor([[ 0,  0],
        [ 0,  9],
        [10, 16],
        [17, 22],
        [23, 25],
        [26, 27],
        [28, 31],
        [32, 35],
        [35, 37],
        [37, 38],
        [38, 42],
        [43, 45],
        [46, 48],
        [48, 51],
        [52, 55],
        [55, 57],
        [57, 58],
        [59, 61],
        [61, 62],
        [63, 64],
        [64, 65],
        [65, 67],
        [68, 70],
        [71, 74],
        [75, 76],
        [76, 77],
        [78, 81],
        [82, 83],
        [ 0,  0],
        [ 0,  0],
        [ 0,  0],
        [ 0,  0],
        [ 0,  0],
        [ 0,  0],
        [ 0,  0],
        [ 0,  0],
        [ 0,  0],
        [ 0,  0],
        [ 0,  0],
        [ 0,  0],
        [ 0,  0],
        [ 0,  0],
        [ 0,  0],
        [ 0,  0],
        [ 0,  0],
        [ 0,  0],
        [ 0,  0],
        [ 0,  0],
        [ 0,  0],
        [ 0,  0],
        [ 0,  0],
        [ 0,  0],
        [ 0,  0],
        [ 0,  0],
        [ 0,  0],
        [ 

  0%|                                                  | 0/2534 [00:13<?, ?it/s]


In [16]:
for input_ids, attention_mask, y, offset_mapping, word_length in tqdm.tqdm(train_iter):
    print(word_length)
    break

  0%|                                                  | 0/2534 [00:00<?, ?it/s]2023-03-30 13:51:44.654267: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


tensor([[ 9,  6,  5,  2,  1,  3, 10,  2,  5,  6,  3,  4,  2,  3,  2,  3,  1,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 2,  3,  6,  5,  2,  5, 10,  3,  1,  5,  6,  1,  3,  3,  4,  3,  4, 11,
          1,  3,  3,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 4, 13,  3,  6,  2,  1,  6,  8,  6,  2,  1,  6,  9,  6,  2,  1,  3,  9,
          5,  6,  2,  1,  6,  4,  6,  2,  2,  7,  8,  6,  2,  1,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 1,  1,  5,  3,  2,  5,  6,  6,  2,  8,  5,  6,  1,  3,  3,  2,  4,  7,
          1,  6,  1,  0,  0,  0,  0,  0,

  0%|                                                  | 0/2534 [00:11<?, ?it/s]


In [17]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

for input_ids, attention_mask, y, offset_mapping, word_length in tqdm.tqdm(train_iter):
    #print(offset_mapping)
    print(len(input_ids[0]), len(attention_mask[0]), len(y[0]), len(offset_mapping[0]), len(word_length[0]))
    print("\t\t\tLengths are the same:", len(input_ids[0]) == len(attention_mask[0]) == len(y[0]) == len(offset_mapping[0]) == len(word_length[0]))
    print("\t\t\tBatch size is correct:", len(input_ids) == 5)
    break

  0%|                                                  | 0/2534 [00:00<?, ?it/s]2023-03-30 13:51:56.250304: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


64 64 64 64 64
			Lengths are the same: True
			Batch size is correct: True


  0%|                                                  | 0/2534 [00:11<?, ?it/s]


In [30]:
#import os
#os.environ["TOKENIZERS_PARALLELISM"] = "false"

next(enumerate(train_iter))

Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/multiprocessing/spawn.py", line 116, in spawn_main
    exitcode = _main(fd, parent_sentinel)
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/multiprocessing/spawn.py", line 126, in _main
    self = reduction.pickle.load(from_parent)
AttributeError: Can't get attribute 'TokenDataset' on <module '__main__' (built-in)>


KeyboardInterrupt: 

In [33]:
len(train_df.iloc[0]["sentence"].split(" "))

17

In [40]:
len(train_dataset[1]), train_dataset[1]


(4,
 (tensor([  101, 12786, 24181,  1179, 16412,  1162,  3084,  2083,  3084,   188,
          18974,  1116,  3697,  6834, 18408,  1111,   251,  2080,  1162, 27629,
          21270,  1424,   117,  1441,  1260,  1204,  1119,  1513, 24181,  1179,
            180, 12148, 11769,  3491,  1818, 14554,  1116,   178,  3084,  1204,
           1137,  1181,   131,   102,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0]),
  tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
  tensor([16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
          16, 16, 16, 16]),
  {'offset_mapping': [(0, 0),
    (0, 2),
    (3, 5),
    (5, 6),
    (7, 12),
    (12, 13),
    (14, 16),
    (16, 19),
    (20, 22),
    (23

In [588]:
len(train_dataset[0][0]), len(train_dataset[0][1]), sum(train_dataset[0][1] == 1), len(train_dataset[0][2]), len(train_dataset[0][3]["word_length"])



(64, 64, tensor(29), 17, 17)

In [None]:
list(sorted(set(" ".join(text_labels).split(" "))))

In [24]:
train_df, val_df = train_test_split(data,
                                    train_size=0.7,
                                    random_state=5550)

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")


train_texts = train_df["sentence"].to_list()
text_labels = train_df["labels"].to_list()
text_labels = list(sorted(set(" ".join(text_labels).split(" "))))
num_classes = len(text_labels)

encoding = tokenizer(train_texts,
                     padding = True,
                     truncation = True,
                     return_offsets_mapping = True,
                     max_length = 64)

subword_ids = torch.tensor(encoding.input_ids)
offsets = torch.tensor(encoding.offset_mapping)
attention_mask = torch.tensor(encoding.attention_mask)

word_lengths = [[len(word) for word in sentence.split()] for sentence in train_texts]
n_subwords = subword_ids.size(1)
n_words = max(len(words) for words in word_lengths)


In [25]:
from torch.utils import data
label_vocab = None

train_texts = train_df["sentence"].to_list()
labels = list(train_df["labels"])

encoding = tokenizer(train_texts,
                         padding = True,
                         truncation = True,
                         return_offsets_mapping = True,
                         max_length = 64)

input_ids = torch.tensor(encoding.input_ids)
attention_mask = torch.tensor(encoding["attention_mask"])

offsets = encoding.offset_mapping
word_lengths = [[len(word) for word in sentence.split()] for sentence in train_texts]

label_vocab = label_vocab if label_vocab is not None else list(sorted(set(" ".join(labels).split(" "))))
num_labels = len(label_vocab)
label_indexer = {i: n for n, i in enumerate(label_vocab)}
print("\nLabel dictionary:", label_indexer)


new_labels = []
for label in labels:
    new_label_row = []
    for lab in label.split(" "):
        new_label_row.append(label_indexer[lab])
    new_labels.append(torch.LongTensor(new_label_row))
    

y = new_labels



Label dictionary: {'B-DRV': 0, 'B-EVT': 1, 'B-GPE_LOC': 2, 'B-GPE_ORG': 3, 'B-LOC': 4, 'B-ORG': 5, 'B-PER': 6, 'B-PROD': 7, 'I-DRV': 8, 'I-EVT': 9, 'I-GPE_LOC': 10, 'I-GPE_ORG': 11, 'I-LOC': 12, 'I-ORG': 13, 'I-PER': 14, 'I-PROD': 15, 'O': 16}


In [89]:
new_labels

[tensor([16,  6, 14, 16, 16, 16,  0, 16, 16, 16, 16, 16, 16, 16, 16,  2, 16]),
 tensor([16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
         16, 16, 16, 16]),
 tensor([ 6, 14, 16, 16, 16, 16,  6, 14, 16, 16, 16,  6, 14, 16, 16, 16,  6, 14,
         14, 16, 16, 16,  6, 14, 16, 16, 16,  6, 14, 16, 16, 16]),
 tensor([16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
         16, 16, 16]),
 tensor([16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
         16,  6, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16]),
 tensor([16, 16, 16, 16]),
 tensor([16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16]),
 tensor([16, 16, 16, 16, 16, 16, 16, 16]),
 tensor([16, 16, 16, 16, 16, 16, 16, 16, 16, 16]),
 tensor([16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16]),
 tensor([16, 16, 16, 16, 16, 16, 16, 16, 16, 16]),
 tensor([16,  2, 16,  6, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
         16, 16]),
 tensor([16, 16, 16, 16

In [29]:
input_ids
attention_mask
offsets = torch.LongTensor(offsets)
word_lengths = torch.LongTensor(word_lengths)
y = torch.LongTensor(y)

TypeError: only integer tensors of a single element can be converted to an index

In [None]:
def pad(l, content, width):
    l.extend([content] * (width - len(l)))
    return l



In [26]:
train_dataset = data.TensorDataset(input_ids,
                                   attention_mask,
                                   offsets,
                                   word_lengths,
                                   y)





AttributeError: 'list' object has no attribute 'size'

In [27]:
type(input_ids), type(attention_mask), type(offsets), type(word_lengths), type(y)


(torch.Tensor, torch.Tensor, list, list, list)

In [78]:
len(input_ids), len(attention_mask), len(offsets), len(word_lengths), len(y)


(12668, 12668, 12668, 12668, 12668)

In [81]:
for i in range(5):
    print(len(input_ids[i]), len(attention_mask[i]), len(offsets[i]), len(word_lengths[i]), len(y[i]))


64 64 64 17 17
64 64 64 22 22
64 64 64 32 32
64 64 64 21 21
64 64 64 30 30


In [None]:
tokens = tokenizer(train_texts,
          return_tensors="pt",
          padding=True,
          truncation=True,
          max_length=64)

In [518]:
tokens

{'input_ids': tensor([[  101,  1697, 14319,  ...,     0,     0,     0],
        [  101, 12786, 24181,  ...,     0,     0,     0],
        [  101,   146,  4121,  ...,  1766,  7174,   102],
        ...,
        [  101, 14177,  1116,  ...,     0,     0,     0],
        [  101,   156, 17945,  ...,     0,     0,     0],
        [  101, 19569, 24577,  ...,   171, 19921,   102]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1]])}

In [513]:
sent = train_texts[0]
ids = tokenizer(sent).input_ids
print(sent)
print([tokenizer.convert_ids_to_tokens(i) for i in ids])

President Barack Obama sa i sin Kairo-tale at islam alltid har vært en del av USA .
['[CLS]', 'President', 'Barack', 'Obama', 'sa', 'i', 'sin', 'Kai', '##ro', '-', 'tale', 'at', 'is', '##lam', 'all', '##ti', '##d', 'ha', '##r', 'v', '##æ', '##rt', 'en', 'del', 'a', '##v', 'USA', '.', '[SEP]']


In [512]:
offsets = tokenizer(sent, return_offsets_mapping=True).offset_mapping
len(offsets), offsets

(29,
 [(0, 0),
  (0, 9),
  (10, 16),
  (17, 22),
  (23, 25),
  (26, 27),
  (28, 31),
  (32, 35),
  (35, 37),
  (37, 38),
  (38, 42),
  (43, 45),
  (46, 48),
  (48, 51),
  (52, 55),
  (55, 57),
  (57, 58),
  (59, 61),
  (61, 62),
  (63, 64),
  (64, 65),
  (65, 67),
  (68, 70),
  (71, 74),
  (75, 76),
  (76, 77),
  (78, 81),
  (82, 83),
  (0, 0)])

In [516]:
word_lengths = [len(word) for word in sent.split()]
len(word_lengths), word_lengths

(17, [9, 6, 5, 2, 1, 3, 10, 2, 5, 6, 3, 4, 2, 3, 2, 3, 1])

In [504]:
train_texts[0]

'President Barack Obama sa i sin Kairo-tale at islam alltid har vært en del av USA .'

In [503]:
tokens["input_ids"][0]

tensor([  101,  1697, 14319,  7661, 21718,   178, 11850, 13354,  2180,   118,
         8144,  1120,  1110,  7609,  1155,  3121,  1181,  5871,  1197,   191,
        28200,  3740,  4035,  3687,   170,  1964,  3066,   119,   102,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0])