In [1]:
from transformers import AutoTokenizer
import transformers

In [3]:
import data_handler_cross_NER
import numpy as np
import os

In [4]:
pretrained_model_relying_on = 'deepset/roberta-base-squad2'

In [5]:
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_relying_on)
assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast)

In [6]:
MODEL_CONTEXT_WINDOW = tokenizer.model_max_length
print("Model: {} has context window size of {}".format(pretrained_model_relying_on, MODEL_CONTEXT_WINDOW))
MAX_SEQ_LENGTH = 384  # question + context + special 64
assert MAX_SEQ_LENGTH <= MODEL_CONTEXT_WINDOW, ("MAX SEQ LENGTH must be smallerEqual than model context window")
DOC_STRIDE = 128  # overlap between 2 consecutive passages from same document, 16
MAX_QUERY_LENGTH = 32
assert DOC_STRIDE < (MAX_SEQ_LENGTH - MAX_QUERY_LENGTH), ("DOC_STRIDE must be smaller than MAX_SEQ_LENGTH - MAX_QUERY_LENGTH, otherwise parts of the doc will be skipped")

Model: deepset/roberta-base-squad2 has context window size of 512


In [7]:
path_to_cross_NER_datasets = "../../datasets/CrossNER/ner_data"
dataset_name = "music"
path_to_questions = f"./cross_ner_questions/{dataset_name}.txt"

In [8]:
questions = data_handler_cross_NER.load_questions_from_txt(path_to_questions)
print(questions)

{'album': 'A music album is a collection of audio recordings, typically songs, that are released together as a single package. Which are albums in the document ?', 'award': 'A music award is like a prize given to musicians or groups to acknowledge their achievements in areas like singing, writing music, and producing. Which are awards in the document ?', 'band': 'A music band is a group of musicians who play instruments and/or sing together. Which are bands in the document ?', 'country': 'A country is a distinct geographical area with its own government, borders, and population. Which are countries in the document ?', 'event': 'An event is a planned and organized occasion or happening, often with a specific purpose or goal, where people gather to participate or observe. Which are events in the document ?', 'location': 'A location is a specific place or position, usually defined by its geographical coordinates, where something exists or events occur. Which are locations in the document 

In [9]:
for neTag, question in questions.items():
    tokenized_examples = tokenizer(
        question,
        truncation="only_second",
        max_length=MAX_SEQ_LENGTH,
        stride=DOC_STRIDE,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )
    print(neTag)
    input_ids = tokenized_examples['input_ids']
    input_ids = np.where(np.array(input_ids) > 1, input_ids, 0)
    print(np.count_nonzero(input_ids))

album
30
award
35
band
25
country
25
event
35
location
31
misc
39
musicalartist
26
musicalinstrument
21
musicgenre
36
organisation
34
person
20
song
28


In [9]:
dataset_dict = data_handler_cross_NER.build_dataset_from_txt(os.path.join(path_to_cross_NER_datasets, dataset_name))

In [10]:
n_tokens_per_sample = []
for sample in dataset_dict["train"]:
    print(sample["tokens"])
    tokenized_examples = tokenizer(
        sample["tokens"],
        truncation="only_second",
        max_length=MAX_SEQ_LENGTH,
        stride=DOC_STRIDE,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )
    input_ids = tokenized_examples['input_ids']
    input_ids = np.where(np.array(input_ids) > 1, input_ids, 0)
    print(np.count_nonzero(input_ids))
    n_tokens_per_sample.append(np.count_nonzero(input_ids))

['In', '2003', ',', 'the', 'Stade', 'de', 'France', 'was', 'the', 'primary', 'site', 'of', 'the', '2003', 'World', 'Championships', 'in', 'Athletics', '.']
44
['In', 'addition', 'to', 'relentless', 'touring', 'in', 'the', 'U.S.', 'and', 'Canada', ',', 'PUSA', 'made', 'multiple', 'tours', 'of', 'Europe', ',', 'Australia', ',', 'New', 'Zealand', 'and', 'Japan', '.']
61
['Barney', 'Bubbles', 'directed', 'several', 'videos', ',', 'including', 'the', 'Specials', "'", 'Ghost', 'Town', ',', 'Squeeze', "'", 's', 'Is', 'That', 'Love', 'and', 'Tempted', ',', 'Elvis', 'Costello', "'", 's', 'Clubland', 'and', 'New', 'Lace', 'Sleeves', ',', 'and', 'Fun', 'Boy', 'Three', "'", 's', 'The', 'Lunatics', '(', 'Have', 'Taken', 'Over', 'the', 'Asylum', ')', '.']
115
['Since', 'then', 'there', 'has', 'been', 'a', 'renaissance', 'in', 'Sacred', 'Harp', 'singing', ',', 'with', 'annual', 'conventions', 'popping', 'up', 'in', 'United', 'States', 'and', 'in', 'a', 'number', 'of', 'European', 'countries', 'recent

In [11]:
max_n_tokens_per_sample = np.max(np.array(n_tokens_per_sample))
print(max_n_tokens_per_sample)
average_n_tokens_per_sample = np.average(np.array(n_tokens_per_sample))
print(average_n_tokens_per_sample)

199
93.59


In [12]:
dataset_QA_format = data_handler_cross_NER.build_dataset_QA_format(dataset_dict, path_to_questions)
print(dataset_QA_format)

{'album': 'A music album is a collection of audio recordings, typically songs, that are released together as a single package. Which are albums in the document ?', 'award': 'A music award is like a prize given to musicians or groups to acknowledge their achievements in areas like singing, writing music, and producing. Which are awards in the document ?', 'band': 'A music band is a group of musicians who play instruments and/or sing together. Which are bands in the document ?', 'country': 'A country is a distinct geographical area with its own government, borders, and population. Which are countries in the document ?', 'event': 'An event is a planned and organized occasion or happening, often with a specific purpose or goal, where people gather to participate or observe. Which are events in the document ?', 'location': 'A location is a specific place or position, usually defined by its geographical coordinates, where something exists or events occur. Which are locations in the document 

In [24]:
n_tokens_for_this_answer = []
for sample in dataset_QA_format["train"]:
    sample_answers = sample["answers"]
    for answer in sample_answers["text"]:
        print(answer)
        tokenized_examples = tokenizer(
            answer,
            truncation="only_second",
            max_length=MAX_SEQ_LENGTH,
            stride=DOC_STRIDE,
            return_overflowing_tokens=True,
            return_offsets_mapping=True,
            padding="max_length",
        )
        input_ids = tokenized_examples['input_ids']
        #print(input_ids)
        input_ids = np.where(np.array(input_ids) > 1, input_ids, 0)
        print(np.count_nonzero(input_ids))
        n_tokens_for_this_answer.append(np.count_nonzero(input_ids))

max_n_tokens_per_answer = np.max(np.array(n_tokens_for_this_answer))
print("max_n_tokens_per_answer", max_n_tokens_per_answer)
average_n_tokens_per_answer = np.average(np.array(n_tokens_for_this_answer))
print("average_n_tokens_per_answer", average_n_tokens_per_answer)

2003 World Championships in Athletics
6
Stade de France
5
PUSA
3
U.S.
5
Canada
2
Australia
2
New Zealand
3
Japan
2
Europe
2
the Specials
4
Ghost Town
3
Squeeze
4
Fun Boy Three
4
Elvis Costello
5
Barney Bubbles
5
Is That Love
4
Tempted
4
Clubland
3
New Lace Sleeves
7
The Lunatics ( Have Taken Over the Asylum )
11
United States
3
United Kingdom
3
Germany
2
Ireland
2
Poland
3
Australia
2
European
2
Sacred Harp
5
Backstreet Boys
4
Spice Girls
4
Madonna
3
Debbie Gibson
4
Tiffany
4
Sun Ra Visits Planet Earth
7
Interstellar Low Ways
5
Super-Sonic Jazz
6
We Travel the Space Ways
6
The Nubians of Plutonia
9
Jazz In Silhouette
7
Sun Ra
3
Tea for the Tillerman
6
Teaser and the Firecat
7
US
2
Stevens
3
Recording Industry Association of America
7
BBC News
3
Top New Male Vocalist
7
Vocal Event of the Year
7
Billboard
3
Academy of Country Music
7
Country Music Association
4
Power in Black
4
Exodus
3
Testament
3
Bay Area
3
Verni
4
Skates
3
Ellsworth
3
Gustafson
5
thrash metal
5
Rock and roll
4
Punk ro

# other tests

In [25]:
from datasets import load_dataset, Dataset, DatasetDict, concatenate_datasets
from transformers import AutoTokenizer
import transformers
import numpy as np
import pickle
import torch
import sys
import os

# my libraries
import data_handler_cross_NER  # to load dataset in BIO format and convert it into QA format for NER


def rename_ids(examples):
    examples['doc_question_pairID'] = strIDs_to_floatIDs[examples['doc_question_pairID'].split(':')[0]][examples['doc_question_pairID']]
    return examples


def prepare_features_for_training(examples):
    # concatenate the question;document_context and tokenize (adding also RoBERTa special tokens)
    # overflows will be automatically treated by using a sliding window approach
    # questions are concatenated to the left of the document_context
    """
    tokenized_examples = tokenizer(
        examples["question"],
        examples["document_context"],
        truncation="only_second",
        max_length=MAX_SEQ_LENGTH,
        stride=DOC_STRIDE,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )
    """
    # setting padding=longest, padding to the longest sequence in the batch
    tokenized_examples = tokenizer(
        examples["question"],
        examples["document_context"],
        truncation="only_second",
        max_length=MAX_SEQ_LENGTH,
        stride=DOC_STRIDE,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding=False,  # not padding here
    )

    # Since one document might produce several passages if it has a long context, we need a map from passages to its corresponding doc-question sample
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")

    # The offset mappings will give us a map from token to character positions in the original context.
    # This will help us compute the start_positions and end_positions.
    offset_mapping = tokenized_examples.pop("offset_mapping")

    # in multispan EQA for each sample we may have multiple start_positions & end_positions
    # therefore will not be a single int for each sample but a List[int]
    # we cannot have variable length Lists when padding, nor we want to impose a maximum number of answers per passage
    # we encode spans as multi-1-hot-vectors
    tokenized_examples["start_positions"] = [np.zeros(len(offset_mapping[i]), dtype=int) for i in range(len(offset_mapping))]
    tokenized_examples["end_positions"] = [np.zeros(len(offset_mapping[i]), dtype=int) for i in range(len(offset_mapping))]

    # which are passage tokens and which are question/special tokens
    tokenized_examples["sequence_ids"] = [[] for i in range(len(offset_mapping))]

    # in passage_id we save the doc_question_pairID that generated it to later collect back passages answers to doc level
    tokenized_examples["passage_id"] = []

    # new offset_mappings with [-1, -1] if not passage token (added to pad to MAX_SEQ_LENGTH)
    tokenized_examples["offset_mapping"] = [[] for i in range(len(offset_mapping))]

    for i, offsets in enumerate(offset_mapping):
        # giving to passageID the ID of the doc-question pair that generated it
        sample_index = sample_mapping[i]
        tokenized_examples["passage_id"].append(examples["doc_question_pairID"][sample_index])

        # Labeling impossible answers with the index of the CLS token
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)

        # Grab the sequence corresponding to that example (to know what is the context and what is the question).
        sequence_ids = tokenized_examples.sequence_ids(i)  # i is batch index
        # creating mask with 1 marking valid CLS and passage tokens
        sequence_ids = np.where(np.array(sequence_ids) == 1, sequence_ids, 0)  # 0 if 0 or None (special tokens and padding tokens to MAX_SEQ_LENGTH)
        sequence_ids[0] = 1  # CLS token will be used for not_answerable questions then its token must be treated as passage token
        tokenized_examples["sequence_ids"][i] = sequence_ids
        tokenized_examples["offset_mapping"][i] = [
            (o if sequence_ids[k] == 1 else (-1, -1))
            for k, o in enumerate(offset_mapping[i])
        ]

        sample_index = sample_mapping[i]
        answers = examples["answers"][sample_index]
        # If no answers at document level are given, set the cls_index as answer.
        if len(answers["answer_start"]) == 0:
            tokenized_examples["start_positions"][i][cls_index] = 1
            tokenized_examples["end_positions"][i][cls_index] = 1
        else:
            atLeastOneAnswer = False
            for answer_start_char, answer_text in zip(answers["answer_start"], answers["text"]):
                # sequence_ids hides the sequence_ids modified to act as mask for question tokens and passage tokens
                # retrieve not modified back
                sequence_ids = tokenized_examples.sequence_ids(i)

                # Start/end character index of the answer in the text.
                start_char = answer_start_char
                end_char = start_char + len(answer_text)

                # moving start token index to the start of the passage
                token_start_index = 0
                while sequence_ids[token_start_index] != 1:
                    token_start_index += 1

                # moving end token index to the end of the passage
                token_end_index = len(input_ids) - 1
                while sequence_ids[token_end_index] != 1:
                    token_end_index -= 1

                # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index).
                if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
                    tokenized_examples["start_positions"][i][cls_index] = 1
                    tokenized_examples["end_positions"][i][cls_index] = 1
                else:
                    # Otherwise move the token_start_index and token_end_index to the two ends of the answer.
                    while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                        token_start_index += 1
                    tokenized_examples["start_positions"][i][token_start_index - 1] = 1

                    while offsets[token_end_index][1] >= end_char:
                        token_end_index -= 1
                    tokenized_examples["end_positions"][i][token_end_index + 1] = 1

                    atLeastOneAnswer = True  # there is at least one answer in this passage

            # it may be that some doc level answer was not in the passage and triggered the CLS position to be 1
            # we set it back to 0
            if atLeastOneAnswer:
                tokenized_examples["start_positions"][i][cls_index] = 0
                tokenized_examples["end_positions"][i][cls_index] = 0

    return tokenized_examples

In [26]:

pretrained_model_relying_on = 'deepset/roberta-base-squad2'

# loading tokenizer
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_relying_on)
assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast)

MODEL_CONTEXT_WINDOW = tokenizer.model_max_length
print("Model: {} has context window size of {}".format(pretrained_model_relying_on, MODEL_CONTEXT_WINDOW))

MAX_SEQ_LENGTH = 256  # question + context + special # 128 for conll2003, 256 for others
assert MAX_SEQ_LENGTH <= MODEL_CONTEXT_WINDOW, ("MAX SEQ LENGTH must be smallerEqual than model context window")
MAX_QUERY_LENGTH = 48
DOC_STRIDE = 64  # overlap between 2 consecutive passages from same document, 32 for conll2003
assert DOC_STRIDE < (MAX_SEQ_LENGTH - MAX_QUERY_LENGTH), ("DOC_STRIDE must be smaller, otherwise parts of the doc will be skipped")

# name of the dataset to convert to QA and tokenize
path_to_cross_NER_datasets = "../../datasets/CrossNER/ner_data"
#path_to_cross_NER_datasets = "./datasets/CrossNER/ner_data"
dataset_name = "music"

# loading dataset in BIO format
dataset_BIO_format = data_handler_cross_NER.build_dataset_from_txt(os.path.join(path_to_cross_NER_datasets, dataset_name))

# converting to QA for NER format (building document;question;gold_answers dataset)
#path_to_questions = os.path.join("./cross_ner_questions_simpler/", dataset_name + ".txt")
path_to_questions = os.path.join("./cross_ner_questions/", dataset_name + ".txt")
dataset_QA_format = data_handler_cross_NER.build_dataset_QA_format(dataset_BIO_format, path_to_questions)

print("Dataset converted to QA format: ")
print(dataset_QA_format)

print("some samples:")
print(dataset_QA_format["train"][0])
print(dataset_QA_format["train"][1])
print(dataset_QA_format["train"][23])

dataset_name = dataset_QA_format.pop("dataset_name")

# Shuffling question-document samples to not have all questions for a document grouped
dataset_QA_format = dataset_QA_format.shuffle()

# dict for str <--> float ID renaming
strIDs_to_floatIDs = {splitName: {} for splitName in dataset_QA_format.keys()}
i = 0.5
for splitName in dataset_QA_format.keys():
    for sample in dataset_QA_format[splitName]:
        strIDs_to_floatIDs[splitName][sample['doc_question_pairID']] = i
        i += 1
# inverting dict
floatIDs_to_strIDs = {splitName: {} for splitName in strIDs_to_floatIDs.keys()}
for splitName in strIDs_to_floatIDs.keys():
    floatIDs_to_strIDs[splitName] = {v: k for k, v in strIDs_to_floatIDs[splitName].items()}

# RENAMING str IDs to Float IDs
dataset_QA_format["train"] = dataset_QA_format["train"].map(rename_ids, batched=False)
dataset_QA_format["validation"] = dataset_QA_format["validation"].map(rename_ids, batched=False)
dataset_QA_format["test"] = dataset_QA_format["test"].map(rename_ids, batched=False)

# saving dataset_doc_quest_ans
# dirToSaveTo = "../../datasets/CrossNER_QA_format_simpler"
dirToSaveTo = "./datasets/CrossNER_QA_format"
os.makedirs(os.path.join(dirToSaveTo, dataset_name))
with open(os.path.join(dirToSaveTo, dataset_name, 'dataset_doc_quest_ans.pickle'), 'wb') as handle:
    pickle.dump(dataset_QA_format, handle, protocol=pickle.HIGHEST_PROTOCOL)

# building tokenized datasets
tokenized_datasets = dataset_QA_format.map(prepare_features_for_training, batched=True, remove_columns=dataset_QA_format["train"].column_names)
print(tokenized_datasets)

loading configuration file config.json from cache at /Users/andrew/.cache/huggingface/hub/models--deepset--roberta-base-squad2/snapshots/e84d19c1ab20d7a5c15407f6954cef5c25d7a261/config.json
Model config RobertaConfig {
  "_name_or_path": "deepset/roberta-base-squad2",
  "architectures": [
    "RobertaForQuestionAnswering"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "language": "english",
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "name": "Roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.24.0",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading file vocab.json from cach

Model: deepset/roberta-base-squad2 has context window size of 512
{'album': 'A music album is a collection of audio recordings, typically songs, that are released together as a single package. Which are albums in the document ?', 'award': 'A music award is like a prize given to musicians or groups to acknowledge their achievements in areas like singing, writing music, and producing. Which are awards in the document ?', 'band': 'A music band is a group of musicians who play instruments and/or sing together. Which are bands in the document ?', 'country': 'A country is a distinct geographical area with its own government, borders, and population. Which are countries in the document ?', 'event': 'An event is a planned and organized occasion or happening, often with a specific purpose or goal, where people gather to participate or observe. Which are events in the document ?', 'location': 'A location is a specific place or position, usually defined by its geographical coordinates, where some

  0%|          | 0/1300 [00:00<?, ?ex/s]

  0%|          | 0/4940 [00:00<?, ?ex/s]

  0%|          | 0/6045 [00:00<?, ?ex/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

KeyboardInterrupt: 