In [1]:
import os
import functools

import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification

In [2]:
from transformers import LongformerTokenizerFast, DataCollatorWithPadding

# Directory holding the locally saved Longformer tokenizer files.
LONGFORMER_DIR = "./data/longformer/"

# local_files_only=True restricts loading to files already on disk.
# add_prefix_space=True is set because the tokenizer is later fed pre-split
# words (is_split_into_words=True).
tokenizer = LongformerTokenizerFast.from_pretrained(
    LONGFORMER_DIR,
    local_files_only=True,
    add_prefix_space=True,
)
# Collator that pads every batch up to its longest sequence.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [3]:
# Two short demo documents; each is longer than the small max_length used when
# tokenizing, so each one is split into several chunks.
text = [
    "This is a iterator driverless sample text for testing. How good is this beautification of grammarly?",
    "This is a second text installation purposeful. Exclaimation is undress of poetry!",
]
# Whitespace-split every document into its own list of words.
text_words = list(map(str.split, text))

In [10]:
# Tokenizer settings for splitting each document into short, overlapping chunks.
chunking_options = dict(
    is_split_into_words=True,        # input is already a list of words per document
    max_length=15,                   # token cap per chunk, special tokens included
    padding=False,                   # leave chunks at their natural length
    truncation=True,
    return_offsets_mapping=True,
    return_overflowing_tokens=True,  # emit the truncated remainder as extra chunks
    stride=2,                        # tokens shared between consecutive chunks
)
result = tokenizer(text_words, **chunking_options)

In [11]:
# One entry per produced chunk: the index (into text_words) of the document that
# chunk was cut from.
result["overflow_to_sample_mapping"]

[0, 0, 1, 1]

In [13]:
def _show_sized(values):
    """Print the length of ``values`` followed by ``values`` itself."""
    print(len(values))
    print(values)


# Sentinel that never equals a real sample index, so the first chunk always
# opens a new text.
prev_sentence_id = -100
chunk_stream = zip(result["input_ids"], result["overflow_to_sample_mapping"])
for text_index, (token_ids, sentence_id) in enumerate(chunk_stream):
    # text_index counts chunks (rows of the encoding); sentence_id is the index
    # of the source document the chunk came from.
    if prev_sentence_id != sentence_id:
        print("\n########## NEW TEXT ##########")

    print("########## TOKEN_IDS ############")
    _show_sized(token_ids)

    print("########## TOKENS ############")
    tokens = tokenizer.convert_ids_to_tokens(token_ids)
    _show_sized(tokens)

    print("########## WORD_IDS ############")
    word_ids = result.word_ids(batch_index=text_index)
    _show_sized(word_ids)

    print("########## WORDS ############")
    words = text_words[sentence_id]
    # One entry per token that maps to a word (None entries are skipped), so a
    # word split into several tokens appears several times.
    sub_text_words = [words[wid] for wid in word_ids if wid is not None]
    _show_sized(sub_text_words)

    prev_sentence_id = sentence_id


########## NEW TEXT ##########
########## TOKEN_IDS ############
15
[0, 152, 16, 10, 49757, 1393, 1672, 7728, 2788, 13, 3044, 4, 1336, 205, 2]
########## TOKENS ############
15
['<s>', 'ĠThis', 'Ġis', 'Ġa', 'Ġiterator', 'Ġdriver', 'less', 'Ġsample', 'Ġtext', 'Ġfor', 'Ġtesting', '.', 'ĠHow', 'Ġgood', '</s>']
########## WORD_IDS ############
15
[None, 0, 1, 2, 3, 4, 4, 5, 6, 7, 8, 8, 9, 10, None]
########## WORDS ############
13
['This', 'is', 'a', 'iterator', 'driverless', 'driverless', 'sample', 'text', 'for', 'testing.', 'testing.', 'How', 'good']
########## TOKEN_IDS ############
12
[0, 1336, 205, 16, 42, 28651, 5000, 9, 33055, 352, 116, 2]
########## TOKENS ############
12
['<s>', 'ĠHow', 'Ġgood', 'Ġis', 'Ġthis', 'Ġbeaut', 'ification', 'Ġof', 'Ġgrammar', 'ly', '?', '</s>']
########## WORD_IDS ############
12
[None, 9, 10, 11, 12, 13, 13, 14, 15, 15, 15, None]
########## WORDS ############
10
['How', 'good', 'is', 'this', 'beautification', 'beautification', 'of', 'grammarly?', 'gram