In [1]:
import os
import functools

import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification
import transformer_utils as tu

In [2]:
from transformers import LongformerTokenizerFast, DataCollatorWithPadding

# Load the fast Longformer tokenizer from the local checkpoint directory only.
# add_prefix_space=True is passed because the inputs below are pre-split into words.
tokenizer = LongformerTokenizerFast.from_pretrained(
    "./data/longformer/",
    local_files_only=True,
    add_prefix_space=True,
)
# Collator built on the tokenizer; it pads every batch up to its longest sequence.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [3]:
# Two short sample documents used throughout the notebook.
test_text = [
    "This is a iterator driverless sample text for testing. How good is this beautification of grammarly?",
    "This is a second text installation purposeful. Exclaimation is undress of poetry!",
]
# Whitespace-split every document into its list of words.
text_words = list(map(str.split, test_text))

In [4]:
# Tokenize the pre-split words into chunks of at most 15 tokens; text that does
# not fit is returned as extra chunks that overlap the previous one by 2 tokens.
chunking_options = {
    "is_split_into_words": True,
    "max_length": 15,
    "padding": False,
    "truncation": True,
    "return_offsets_mapping": True,
    "return_overflowing_tokens": True,
    "stride": 2,
}
result = tokenizer(text_words, **chunking_options)

In [5]:
result

{'input_ids': [[0, 152, 16, 10, 49757, 1393, 1672, 7728, 2788, 13, 3044, 4, 1336, 205, 2], [0, 1336, 205, 16, 42, 28651, 5000, 9, 33055, 352, 116, 2], [0, 152, 16, 10, 200, 2788, 8809, 3508, 2650, 4, 3015, 31628, 1258, 16, 2], [0, 1258, 16, 2432, 5224, 9, 14665, 328, 2]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1]], 'offset_mapping': [[(0, 0), (1, 4), (1, 2), (1, 1), (1, 8), (1, 6), (6, 10), (1, 6), (1, 4), (1, 3), (1, 7), (7, 8), (1, 3), (1, 4), (0, 0)], [(0, 0), (1, 3), (1, 4), (1, 2), (1, 4), (1, 5), (5, 14), (1, 2), (1, 7), (7, 9), (9, 10), (0, 0)], [(0, 0), (1, 4), (1, 2), (1, 1), (1, 6), (1, 4), (1, 12), (1, 7), (7, 10), (10, 11), (1, 2), (2, 7), (7, 12), (1, 2), (0, 0)], [(0, 0), (7, 12), (1, 2), (1, 3), (3, 7), (1, 2), (1, 6), (6, 7), (0, 0)]], 'overflow_to_sample_mapping': [0, 0, 1, 1]}

In [6]:
result["overflow_to_sample_mapping"]

[0, 0, 1, 1]

In [7]:
result["input_ids"]

[[0, 152, 16, 10, 49757, 1393, 1672, 7728, 2788, 13, 3044, 4, 1336, 205, 2],
 [0, 1336, 205, 16, 42, 28651, 5000, 9, 33055, 352, 116, 2],
 [0, 152, 16, 10, 200, 2788, 8809, 3508, 2650, 4, 3015, 31628, 1258, 16, 2],
 [0, 1258, 16, 2432, 5224, 9, 14665, 328, 2]]

In [8]:
def _print_len_and_values(values):
    """Print the length of `values` followed by `values` itself."""
    print(len(values))
    print(values)


# Walk over every tokenized chunk and show how its token ids, tokens, word ids
# and source words line up. A banner is printed whenever the source text changes.
prev_sentence_id = -100  # sentinel: no chunk has been seen yet
chunk_pairs = zip(result["input_ids"], result["overflow_to_sample_mapping"])
for text_index, (token_ids, sentence_id) in enumerate(chunk_pairs):
    if prev_sentence_id != sentence_id:
        print("\n########## NEW TEXT ##########")

    print("########## TOKEN_IDS ############")
    _print_len_and_values(token_ids)

    print("########## TOKENS ############")
    tokens = tokenizer.convert_ids_to_tokens(token_ids)
    _print_len_and_values(tokens)

    print("########## WORD_IDS ############")
    word_ids = result.word_ids(batch_index=text_index)
    _print_len_and_values(word_ids)

    print("########## WORDS ############")
    words = text_words[sentence_id]
    # Special tokens have a word id of None and map to no source word.
    sub_text_words = [words[word_id] for word_id in word_ids if word_id is not None]
    _print_len_and_values(sub_text_words)

    prev_sentence_id = sentence_id


########## NEW TEXT ##########
########## TOKEN_IDS ############
15
[0, 152, 16, 10, 49757, 1393, 1672, 7728, 2788, 13, 3044, 4, 1336, 205, 2]
########## TOKENS ############
15
['<s>', 'ĠThis', 'Ġis', 'Ġa', 'Ġiterator', 'Ġdriver', 'less', 'Ġsample', 'Ġtext', 'Ġfor', 'Ġtesting', '.', 'ĠHow', 'Ġgood', '</s>']
########## WORD_IDS ############
15
[None, 0, 1, 2, 3, 4, 4, 5, 6, 7, 8, 8, 9, 10, None]
########## WORDS ############
13
['This', 'is', 'a', 'iterator', 'driverless', 'driverless', 'sample', 'text', 'for', 'testing.', 'testing.', 'How', 'good']
########## TOKEN_IDS ############
12
[0, 1336, 205, 16, 42, 28651, 5000, 9, 33055, 352, 116, 2]
########## TOKENS ############
12
['<s>', 'ĠHow', 'Ġgood', 'Ġis', 'Ġthis', 'Ġbeaut', 'ification', 'Ġof', 'Ġgrammar', 'ly', '?', '</s>']
########## WORD_IDS ############
12
[None, 9, 10, 11, 12, 13, 13, 14, 15, 15, 15, None]
########## WORDS ############
10
['How', 'good', 'is', 'this', 'beautification', 'beautification', 'of', 'grammarly?', 'gram

In [9]:
# def tokenize_text(tokenizer, data_row):
#     # convert the text to word tokens splitting on " "        
#     print(data_row["text"])
#     if isinstance(data_row["text"], list) :
#         text_words = [item.split() for item in data_row["text"]]
#     else:
#         text_words = data_row["text"].split()        
#     print(text_words)
#     encoding = tokenizer(
#         text_words, 
#         is_split_into_words=True,
#         max_length=15,
#         padding=False, 
#         truncation=True,
#         return_offsets_mapping=True, 
#         return_overflowing_tokens=True,
#         stride=2
#     ) 
#     word_ids = []
#     for idx, token_ids in enumerate(encoding["input_ids"]):
#         # The word_id for CLS, SEP special tokens in None, we need to change is the special id of -100 so that
#         # encoding can be converted to a tensor during batching        
#         word_ids.append([-100 if wordid == None else wordid for wordid in encoding.word_ids(batch_index=idx)])        
#     encoding["word_ids"] = word_ids       
#     return encoding      

In [10]:
def tokenize_text(tokenizer, data_row, *, max_length=15, stride=2):
    """Tokenize one row or a batch of rows into overlapping, length-limited chunks.

    The text is split on whitespace into words and passed to `tokenizer` with
    `is_split_into_words=True`, truncation and `return_overflowing_tokens=True`,
    so one text can produce several chunks.

    Args:
        tokenizer: Callable tokenizer. Its return value must support item
            access/assignment and a `word_ids(batch_index=...)` method.
        data_row: Mapping with "text" and "essay_id" entries. If "text" is a
            list the row is treated as a batch ("essay_id" is then indexed by
            sample position); otherwise it is treated as a single text.
        max_length: Maximum number of tokens per chunk (default 15).
        stride: Value forwarded as the tokenizer's `stride` argument (default 2).

    Returns:
        The tokenizer's encoding with two added entries, both holding one item
        per chunk: "word_ids" (word index per token, -100 where the tokenizer
        reports None) and "essay_id" (essay id of the chunk's source text).
    """
    texts = data_row["text"]
    # A list of texts means batch tokenization; anything else is a single text.
    is_batch = isinstance(texts, list)
    if is_batch:
        text_words = [item.split() for item in texts]
    else:
        text_words = texts.split()

    encoding = tokenizer(
        text_words,
        is_split_into_words=True,
        max_length=max_length,
        padding=False,
        truncation=True,
        return_offsets_mapping=True,
        return_overflowing_tokens=True,
        stride=stride,
    )

    word_ids = []
    essay_ids = []
    # overflow_to_sample_mapping holds, for every chunk, the index of its source text.
    for idx, text_id in enumerate(encoding["overflow_to_sample_mapping"]):
        if is_batch:
            essay_ids.append(data_row["essay_id"][text_id])
        else:
            essay_ids.append(data_row["essay_id"])
        # The word id of special tokens (CLS, SEP) is None; replace it with the
        # special id -100 so the encoding can be converted to a tensor during batching.
        word_ids.append(
            [-100 if word_id is None else word_id for word_id in encoding.word_ids(batch_index=idx)]
        )
    encoding["word_ids"] = word_ids
    encoding["essay_id"] = essay_ids
    return encoding

In [11]:
from functools import partial
from transformers import AutoTokenizer, DataCollatorForTokenClassification

# Load the Longformer tokenizer by model name; add_prefix_space=True is passed
# because the texts are tokenized as pre-split words.
tokenizer = AutoTokenizer.from_pretrained(
    "allenai/longformer-base-4096",
    add_prefix_space=True,
)
# Token-classification collator built on the tokenizer (replaces the earlier
# DataCollatorWithPadding instance bound to the same name).
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [12]:
# Column name -> dtype for the (initially empty) test frame.
_df_test_dtypes = {
    "essay_id": "int64",
    "id": "object",
    "text": "object",
    "discourse_type": "object",
    "prediction_string": "object",
}
# Start from an empty frame that already has the final columns and dtypes.
df_test = pd.DataFrame(
    {column: pd.Series([], dtype=dtype) for column, dtype in _df_test_dtypes.items()}
)

In [15]:
# Build one record per test text and add them to df_test with a single concat.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and growing
# a frame row by row copies it on every iteration.
essay_id = 0
new_rows = []
for doc_id, text in zip(["testid1", "testid2"], test_text):
    print(f"id = {doc_id}, essay_id = {essay_id},  text = {text}")
    new_rows.append({
        "essay_id": essay_id,
        "id": doc_id,
        "text": text,
        # Not known yet for the test data; left empty.
        "discourse_type": None,
        "prediction_string": None,
    })
    essay_id += 1
# df_test comes first so its column order is kept; the index is renumbered from 0.
df_test = pd.concat([df_test, pd.DataFrame(new_rows)], ignore_index=True)

id = testid1, essay_id = 0,  text = This is a iterator driverless sample text for testing. How good is this beautification of grammarly?
id = testid2, essay_id = 1,  text = This is a second text installation purposeful. Exclaimation is undress of poetry!


In [16]:
df_test

Unnamed: 0,essay_id,id,text,discourse_type,prediction_string
0,0,testid1,This is a iterator driverless sample text for ...,,
1,1,testid2,This is a second text installation purposeful....,,


In [1]:
import os
from torch.utils.data import Dataset, DataLoader
import datasets
#from datasets import Dataset

In [None]:


# Build the test DataLoader: DataFrame -> datasets.Dataset -> tokenized chunks -> batches.
# Bind the tokenizer so the mapped function only receives the batch of rows.
preprocess_test_data = partial(tokenize_text, tokenizer)
ds_test_raw = datasets.Dataset.from_pandas(df_test)
# The original columns are dropped after mapping: one input row can yield
# several chunk rows, so only the columns returned by tokenize_text are kept.
ds_test_raw_col_names = ds_test_raw.column_names
ds_test = ds_test_raw.map(
    preprocess_test_data,
    batched=True,
    remove_columns=ds_test_raw_col_names,
)
# NOTE(review): ds_test also carries the variable-length columns returned by
# tokenize_text (offset_mapping, word_ids) — confirm data_collator can batch
# them before iterating dl_test.
dl_test = DataLoader(
    ds_test,
    batch_size=2,
    collate_fn=data_collator,
    num_workers=2,
)

In [21]:
ds_test["input_ids"]

[[0, 152, 16, 10, 49757, 1393, 1672, 7728, 2788, 13, 3044, 4, 1336, 205, 2],
 [0, 1336, 205, 16, 42, 28651, 5000, 9, 33055, 352, 116, 2],
 [0, 152, 16, 10, 200, 2788, 8809, 3508, 2650, 4, 3015, 31628, 1258, 16, 2],
 [0, 1258, 16, 2432, 5224, 9, 14665, 328, 2]]

In [22]:
ds_test["word_ids"]

[[-100, 0, 1, 2, 3, 4, 4, 5, 6, 7, 8, 8, 9, 10, -100],
 [-100, 9, 10, 11, 12, 13, 13, 14, 15, 15, 15, -100],
 [-100, 0, 1, 2, 3, 4, 5, 6, 6, 6, 7, 7, 7, 8, -100],
 [-100, 7, 8, 9, 9, 10, 11, 11, -100]]

In [36]:
# Store the list-valued cells one at a time with .at. The previous one-shot
# .loc assignment made pandas build a NumPy array from a ragged nested list
# (see the `np.array(value)` warning this cell used to emit).
df_test.at[0, "discourse_type"] = ["test_discourse"]
df_test.at[0, "prediction_string"] = [0, 1, 2]

  arr_value = np.array(value)


In [37]:
df_test

Unnamed: 0,id,text,discourse_type,prediction_string
0,testid1,This is a iterator driverless sample text for ...,[test_discourse],"[0, 1, 2]"
1,testid2,This is a second text installation purposeful....,,


In [38]:
df_test["record_id"] = pd.Series([0,1])

In [39]:
# Print each test id next to its text. The loop variable is named doc_id so it
# does not shadow the builtin id().
for doc_id, text in zip(["testid1", "testid2"], test_text):
    print(doc_id, text)

testid1 This is a iterator driverless sample text for testing. How good is this beautification of grammarly?
testid2 This is a second text installation purposeful. Exclaimation is undress of poetry!


In [40]:
# Scratch dict mixing list values with a scalar counter.
test_dict = dict(id=["test_id1"], text=["test text"], counter=0)

# Show the values in insertion order.
[*test_dict.values()]

[['test_id1'], ['test text'], 0]

In [41]:
# Bump the counter in place, then display the updated dict.
test_dict["counter"] += 1
test_dict

{'id': ['test_id1'], 'text': ['test text'], 'counter': 1}

In [42]:
[1] * 5

[1, 1, 1, 1, 1]