In [1]:
import pandas as pd
import pickle
import numpy as np

import os

import torch
from torch.utils.data import Subset

import transformers
from transformers import AutoTokenizer

In [2]:
# Checkpoint identifier passed to AutoTokenizer.from_pretrained below.
model_name='dmis-lab/biobert-v1.1'
# Module-level tokenizer; BERTtokenizovanje_ClassText reads this global directly.
tokenizer = AutoTokenizer.from_pretrained(model_name)  # load the tokenizer that belongs to this specific checkpoint

In [2]:
def load_from_pickle(pickle_name):
    """Read one pickled object from disk and return it.

    Args:
        pickle_name: Path of the pickle file to open (opened in binary mode).

    Returns:
        The object produced by ``pickle.load`` for that file.

    Note:
        Unpickling can execute arbitrary code, so only pass paths to files
        that come from a trusted source.
    """
    with open(pickle_name, 'rb') as source:
        return pickle.load(source)

In [80]:
# The function takes the dataset and the name of the file in which it should be pickled.
# The third argument (when explicitly called) forms a subfolder of the BioNER class name that is "hidden"
# for zero-shot and few-shot training, which enables a transparent folder structure of the data.

# Returns nothing, just saves the pickle dataset.

def dump_to_pickle(data_set, file_name, class_name=None):
    """Pickle ``data_set`` to ``Datasets[/<class_name>]/<file_name>.pkl``.

    The target folder is created if it does not exist yet. The path is
    relative to the current working directory.

    Args:
        data_set: Any picklable object.
        file_name: File name without extension; ``'.pkl'`` is appended.
        class_name: Optional sub-folder name below ``Datasets``. When it is
            ``None`` the file is written directly into ``Datasets``.

    Returns:
        None. The only effect is the file written to disk.
    """
    if class_name is None:
        folders = 'Datasets'
    else:
        folders = os.path.join('Datasets', class_name)
    os.makedirs(folders, exist_ok=True)
    file_path = os.path.join(folders, file_name + '.pkl')
    # The context manager flushes and closes the handle even when pickling
    # raises, so no file descriptor is leaked and no partial buffer is left
    # waiting for garbage collection.
    with open(file_path, 'wb') as outfile:
        pickle.dump(data_set, outfile, protocol=4)

In [4]:
# function BERT-tokenizes (WordPiece principle) input
# in our case 2 sequences ((1) explicitly given class and (2) sentence)
# and returns the BERT-tokenized output

def BERTtokenizovanje_ClassText(df):
    """Tokenize (class, text) pairs with the module-level ``tokenizer``.

    Args:
        df: Object whose ``"class"`` and ``"text"`` columns expose
            ``to_list()``. Because ``is_split_into_words=True`` is passed,
            each entry is presumably an already word-split sequence
            (TODO confirm against the pickled dataframes).

    Returns:
        Whatever ``tokenizer`` returns for the pair input. Every encoded
        pair is truncated and padded to a fixed length of 512 tokens and
        includes the special tokens.
    """
    first_segments = df["class"].to_list()
    second_segments = df["text"].to_list()
    options = {
        'truncation': True,
        'is_split_into_words': True,   # inputs are passed as pre-split words
        'add_special_tokens': True,
        'padding': 'max_length',
        'max_length': 512,
    }
    return tokenizer(first_segments, second_segments, **options)

In [5]:
# Due to the nature of BERT-tokenization (the WordPiece approach may return the initial token as multiple tokens),
# alignment of labels is required. In addition, it was necessary to take into account that the sequence was created
# by concatenating 2 initial sequences (classes and sentences)

# In our approach, we opted for a principle where each part of a unique initial token is assigned a value
# (another approach would be to assign a value to only the first part, and to give the rest a value of -100 (corresponds to None)).

# The function takes the loaded dataframe and the BERT-tokenized input (plus an optional flag, label_all_tokens,
# that selects between the two approaches described above) and returns the aligned labels.

def poravnanje_labela(df, tokenized_encodings, label_all_tokens = True):
    """Align word-level labels with the sub-word tokens of each encoded pair.

    Each encoded example is expected to start with one special token,
    followed by the first segment (the class), another special token, and
    then the second segment (the sentence).

    Args:
        df: Object whose ``'labels'`` entry iterates over one label sequence
            per example; labels are indexed by word position.
        tokenized_encodings: Object exposing ``word_ids(batch_index=i)``,
            returning for every token either its word index or ``None``
            for special/padding tokens.
        label_all_tokens: When ``True`` every sub-token of a word gets the
            word's label; when ``False`` only the first sub-token does and
            the remaining ones get -100.

    Returns:
        A list with one list of integer labels per example:
        -100 for special tokens and for words that have no label,
        the constant 1 for every token of the first (class) segment
        regardless of ``label_all_tokens``, and the aligned word labels
        for the second segment.

    Raises:
        ValueError: If an example has no special token after position 0,
            so the end of the first segment cannot be located.
    """
    labels = list()
    for i, label in enumerate(df['labels']):
        word_ids = tokenized_encodings.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None or word_idx >= len(label):
                # Special tokens, and words without a label, are ignored (-100).
                # The range check covers continuation sub-tokens too, which
                # previously raised IndexError.
                label_ids.append(-100)
            elif word_idx != previous_word_idx or label_all_tokens:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        # The second special token (first None after position 0) closes the
        # class segment. It is located via word_ids rather than via label
        # values, because class tokens may themselves carry -100 at this point.
        stop = word_ids.index(None, 1)
        label_ids = label_ids[:1] + [1] * (stop - 1) + label_ids[stop:]
        labels.append(label_ids)
    return labels

In [6]:
# The class aims to create the appropriate type of tensor data required as input to the Trainer method.
# Takes a BERT-tokenized and "aligned" object and returns the dataset class.

class Preoblikuj_u_Dataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with aligned labels.

    Args:
        encodings: Mapping-like object (must provide ``items()``) whose
            values can be indexed by example position.
        labels: Sequence with one label sequence per example; its length
            defines the dataset length.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of examples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors.

        The dict holds one tensor per encoding key plus a ``'labels'``
        tensor, which is added last.
        """
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [4]:
# Call when the script and 3 (train, valid, test) dataframes are in the same folder (possibly change the .pkl names)

# Paths of the pickled train / validation / test inputs, relative to the
# current working directory. They are passed to load_from_pickle in the next cell.
df_train_name = './ALL_DATA_klasa_nova_train.pkl'
df_valid_name = './ALL_DATA_klasa_nova_valid.pkl'
df_test_name = './ALL_DATA_klasa_nova_test.pkl'

In [10]:
# Unpickle the three splits. Presumably each is a pandas DataFrame with
# "class", "text" and "labels" columns, since those are the columns the
# later cells read (TODO confirm against the pickle files).
# NOTE(review): unpickling executes code embedded in the file; only load trusted pickles.
df_train = load_from_pickle(df_train_name)
df_valid = load_from_pickle(df_valid_name)
df_test = load_from_pickle(df_test_name)

In [11]:
# Encode every (class, text) pair of each split with the module-level tokenizer;
# all sequences are truncated/padded to 512 tokens by BERTtokenizovanje_ClassText.
tokenized_encodings_train = BERTtokenizovanje_ClassText(df_train)
tokenized_encodings_valid = BERTtokenizovanje_ClassText(df_valid)
tokenized_encodings_test = BERTtokenizovanje_ClassText(df_test)

In [12]:
# Expand the word-level labels of each split to token level. label_all_tokens=True
# (also the function's default) gives every sub-token of a word that word's label.
labels_train = poravnanje_labela(df_train, tokenized_encodings_train, label_all_tokens = True)
labels_valid = poravnanje_labela(df_valid, tokenized_encodings_valid, label_all_tokens = True)
labels_test = poravnanje_labela(df_test, tokenized_encodings_test, label_all_tokens = True)

In [None]:
# Wrap encodings and aligned labels of each split in a torch Dataset whose
# items are dicts of tensors (encoding fields plus 'labels').
dataset_train = Preoblikuj_u_Dataset(tokenized_encodings_train, labels_train)
dataset_valid = Preoblikuj_u_Dataset(tokenized_encodings_valid, labels_valid)
dataset_test = Preoblikuj_u_Dataset(tokenized_encodings_test, labels_test)

In [None]:
# Persist the three datasets as Datasets/dataset_<split>.pkl (no class_name is
# given, so the files go directly into the Datasets folder).
# NOTE(review): pickle stores instances by class reference, so code that loads
# these files needs Preoblikuj_u_Dataset to be defined/importable under the same name.
dump_to_pickle(dataset_train, 'dataset_train')
dump_to_pickle(dataset_valid, 'dataset_valid')
dump_to_pickle(dataset_test, 'dataset_test')