In [2]:
import numpy as np 
import tensorflow as tf 
import pandas as pd 

In [3]:
from transformers import AutoTokenizer
# Checkpoint whose tokenizer is used throughout the notebook.
CHECKPOINT = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)

In [74]:
## Build a small labelled example: one row per '.'-separated segment

text_1 = "This is a first test. The test contains 3 sentences. This is the last sentence. We have a second claim here, verycompliexword so strange. And we finally have our wonderful conclusion"
labels_1 = ['Claim', 'Evidence', 'Claim', 'Claim', 'Conclusion']

# Segments and labels are paired positionally (both have five entries).
X = pd.DataFrame({'text': text_1.split('.'), 'label': labels_1})

# Whitespace-delimited word count per segment, plus its running total,
# which is the exclusive end index of each segment within the document.
word_counts = X['text'].str.split().str.len()
word_ends = word_counts.cumsum()
word_starts = word_ends - word_counts

# Space-separated document-level word indices covered by each segment.
X['predictionstring'] = [
    ' '.join(str(word_idx) for word_idx in range(first, stop))
    for first, stop in zip(word_starts, word_ends)
]

X

Unnamed: 0,text,label,predictionstring
0,This is a first test,Claim,0 1 2 3 4
1,The test contains 3 sentences,Evidence,5 6 7 8 9
2,This is the last sentence,Claim,10 11 12 13 14
3,"We have a second claim here, verycompliexword...",Claim,15 16 17 18 19 20 21 22 23
4,And we finally have our wonderful conclusion,Conclusion,24 25 26 27 28 29 30


In [75]:
## Tokenize the raw text and show the source span behind every token
tokens = tokenizer(
    text_1,
    padding='max_length',
    max_length=45,
    truncation=True,
    return_attention_mask=True,
    return_token_type_ids=False,
    return_offsets_mapping=True,
)

# Each offset is a (start, end) character span into text_1; slicing with it
# recovers the piece of text the token came from.
offsets = tokens['offset_mapping']
token_texts = [text_1[start:end] for start, end in offsets]

# One column per token id, with its text and offset as the two rows.
pd.DataFrame(
    {'text': token_texts, 'offset': offsets},
    index=pd.Index(tokens['input_ids']),
).T

Unnamed: 0,101,2023,2003,1037,2034,3231,1012,1996,3231.1,3397,1017,11746,1012.1,2023.1,2003.1,1996.1,2197,6251,1012.2,2057,2031,1037.1,2117,4366,2182,1010,2200,9006,24759,2666,2595,18351,2061,4326,1012.3,1998,2057.1,2633,2031.1,2256,6919,7091,102,0,0.1
text,,This,is,a,first,test,.,The,test,contains,3,sentences,.,This,is,the,last,sentence,.,We,have,a,second,claim,here,",",very,com,pl,ie,x,word,so,strange,.,And,we,finally,have,our,wonderful,conclusion,,,
offset,"(0, 0)","(0, 4)","(5, 7)","(8, 9)","(10, 15)","(16, 20)","(20, 21)","(22, 25)","(26, 30)","(31, 39)","(40, 41)","(42, 51)","(51, 52)","(53, 57)","(58, 60)","(61, 64)","(65, 69)","(70, 78)","(78, 79)","(80, 82)","(83, 87)","(88, 89)","(90, 96)","(97, 102)","(103, 107)","(107, 108)","(109, 113)","(113, 116)","(116, 118)","(118, 120)","(120, 121)","(121, 125)","(126, 128)","(129, 136)","(136, 137)","(138, 141)","(142, 144)","(145, 152)","(153, 157)","(158, 161)","(162, 171)","(172, 182)","(0, 0)","(0, 0)","(0, 0)"


In [83]:
## Tokenize the whitespace-split words and list the resulting tokens
words = text_1.split()
tokens = tokenizer(
    words,
    is_split_into_words=True,
    padding='max_length',
    max_length=45,
    truncation=True,
    return_attention_mask=True,
    return_token_type_ids=False,
)

# No offset mapping is requested in this call, so the tokens are inspected
# by converting the ids back to their string form.
tokenizer.convert_ids_to_tokens(tokens['input_ids'])

['[CLS]',
 'this',
 'is',
 'a',
 'first',
 'test',
 '.',
 'the',
 'test',
 'contains',
 '3',
 'sentences',
 '.',
 'this',
 'is',
 'the',
 'last',
 'sentence',
 '.',
 'we',
 'have',
 'a',
 'second',
 'claim',
 'here',
 ',',
 'very',
 '##com',
 '##pl',
 '##ie',
 '##x',
 '##word',
 'so',
 'strange',
 '.',
 'and',
 'we',
 'finally',
 'have',
 'our',
 'wonderful',
 'conclusion',
 '[SEP]',
 '[PAD]',
 '[PAD]']

In [85]:
# Index of the input word each token position belongs to; in the output below
# the [CLS], [SEP] and [PAD] positions show up as None.
tokens.word_ids()

[None,
 0,
 1,
 2,
 3,
 4,
 4,
 5,
 6,
 7,
 8,
 9,
 9,
 10,
 11,
 12,
 13,
 14,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 20,
 21,
 21,
 21,
 21,
 21,
 21,
 22,
 23,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 None,
 None,
 None]

In [86]:
# Display the full encoding: input_ids plus the attention_mask (0 on padding).
tokens

{'input_ids': [101, 2023, 2003, 1037, 2034, 3231, 1012, 1996, 3231, 3397, 1017, 11746, 1012, 2023, 2003, 1996, 2197, 6251, 1012, 2057, 2031, 1037, 2117, 4366, 2182, 1010, 2200, 9006, 24759, 2666, 2595, 18351, 2061, 4326, 1012, 1998, 2057, 2633, 2031, 2256, 6919, 7091, 102, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]}