In [2]:
from transformers import AutoTokenizer
import transformers

In [3]:
tokenizer_to_use = "roberta-base"

In [4]:
MAX_SEQ_LENGTH = 512  # question + context + special tokens
DOC_STRIDE = 64  # overlap between 2 consecutive passages from same document
MAX_QUERY_LENGTH = 48  # not used, but questions must not be too long given a chosen DOC_STRIDE

In [5]:
print("\nLoading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(tokenizer_to_use)
assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast)
MODEL_CONTEXT_WINDOW = tokenizer.model_max_length
assert MAX_SEQ_LENGTH <= MODEL_CONTEXT_WINDOW, "MAX SEQ LENGTH must be smallerEqual than model context window"
print(f"MAX_SEQ_LENGTH used to chunk documents: {MAX_SEQ_LENGTH}")
assert DOC_STRIDE < (MAX_SEQ_LENGTH - MAX_QUERY_LENGTH), "DOC_STRIDE must be smaller, otherwise parts of the doc will be skipped"
print("DOC_STRIDE used: {}".format(DOC_STRIDE))


Loading tokenizer...
MAX_SEQ_LENGTH used to chunk documents: 512
DOC_STRIDE used: 64


In [6]:
prefix = "Your task is to extract the Named Entities of type PROCESS from an input TEXT. You are given a DEFINITION and some GUIDELINES.\nDEFINITION: PROCESS refers to a series of actions, changes, or functions that lead to a particular result or outcome, such as procedures, methodologies, and workflows.\nGUIDELINES: Do not label general activities or events without a clear sequence. Exercise caution with ambiguous terms like 'production' (could refer to a process, a company, or a location) and 'assembly' (could refer to a process or a gathering of people).\nTEXT:"

In [7]:
print(prefix)

Your task is to extract the Named Entities of type PROCESS from an input TEXT. You are given a DEFINITION and some GUIDELINES.
DEFINITION: PROCESS refers to a series of actions, changes, or functions that lead to a particular result or outcome, such as procedures, methodologies, and workflows.
GUIDELINES: Do not label general activities or events without a clear sequence. Exercise caution with ambiguous terms like 'production' (could refer to a process, a company, or a location) and 'assembly' (could refer to a process or a gathering of people).
TEXT:


PREFIX LENGTH in tokens = 134

In [8]:
document_context = "Muhammad Ajward Macan Markar\n\nMuhammad Ajward Macan Markar, FRCP is a Sri Lankan physician, academic. He was the first Professor of Medicine at the University of Ceylon, Peradeniya. Born to Sir Mohamed Macan Markar, he was educated at Royal College Colombo, where he played cricket for the college at the Royal-Thomian. Macan Markar went on to study medicine at the University of Ceylon, where he also represented the university at cricket. Graduating with a MBBS and winning the Gold medal for in obstetrics and gynaecology, he went on to gain MBBS and M.D. from the University of London and MRCP by 1952. He is a Fellow of the Royal College of Physicians. He was the younger brother of Ahmed Hussain Macan Markar. References\n\nCategory:Sri Lankan medical doctors\nCategory:Alumni of Royal College, Colombo\nCategory:Alumni of the University of Ceylon (Colombo)\nCategory:Alumni of the University of London\nCategory:Living people\nCategory:Fellows of the Royal College of Physicians\nCategory:Academics of the University of Ceylon (Peradeniya)\nCategory:Year of birth missing (living people)"

CONTEXT in tokens = 270

In [14]:
document_context_2 = document_context + document_context

In [26]:
tokenized = tokenizer(prefix, document_context_2, 
        truncation='only_second',  # longest_first
        max_length=MAX_SEQ_LENGTH,
        stride=DOC_STRIDE,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding=False,  # not padding here
    )

In [28]:
print(len(tokenized['input_ids'][1]))

360


In [30]:
print(tokenizer.decode(tokenized['input_ids'][0][-64:]))
print(tokenizer.decode(tokenized['input_ids'][1][134:134+64]))

 Peradeniya. Born to Sir Mohamed Macan Markar, he was educated at Royal College Colombo, where he played cricket for the college at the Royal-Thomian. Macan Markar went on to study medicine at the University of Ceylon, where he also represented the university at cricket. Graduating</s>
</s>, Peradeniya. Born to Sir Mohamed Macan Markar, he was educated at Royal College Colombo, where he played cricket for the college at the Royal-Thomian. Macan Markar went on to study medicine at the University of Ceylon, where he also represented the university at cricket. Grad
