In [74]:
import json
import torch
import itertools
import operator
from dataclasses import dataclass
from transformers import AutoTokenizer, AutoModel
from seqlbtoolkit.embs import build_bert_token_embeddings
from seqlbtoolkit.text import split_overlength_bert_input_sequence

from tokenizations import get_alignments


In [75]:
# Load the pretrained 'roberta-base' tokenizer.
# add_prefix_space=True is forwarded to the tokenizer; presumably required so that
# RoBERTa accepts pre-split words (is_split_into_words=True) — TODO confirm.
tokenizer = AutoTokenizer.from_pretrained('roberta-base', add_prefix_space=True)
# Disabled model load; note it names a different checkpoint than the tokenizer above.
# model = AutoModel.from_pretrained('bert-base-uncased')


In [85]:
# Whitespace-split reference tokens: ['I', 'have', 'an', 'apple.'].
tks = 'I have an apple.'.split()

# Encode a toy batch of two pre-split sequences (['a', 'b'] twice) with special tokens.
# NOTE(review): this is an encoding of an unrelated toy batch, not the tokens of `tks`;
# the following get_alignments(tks, enc_tks) cell presumably expects a list of token
# strings for the same sentence — confirm which experiment is intended.
enc_tks = tokenizer([['a', 'b'], ['a', 'b']], add_special_tokens=True, is_split_into_words=True)


In [23]:
# Align `tks` against `enc_tks`; the recorded output is a pair of index lists
# (one entry per element of each input).
# NOTE(review): the recorded output has 4 and 7 entries, which matches the 4 split
# words and the 7 BERT tokens of 'I have an apple.' rather than the toy batch
# currently assigned to `enc_tks` — the output looks stale; re-run from a fresh kernel.
get_alignments(tks, enc_tks)

([[1], [2], [3], [4, 5]], [[], [0], [1], [2], [3], [3], []])

In [19]:
# Rebind `tokenizer` to the 'bert-base-uncased' checkpoint (replacing 'roberta-base').
# NOTE(review): add_prefix_space is a RoBERTa/GPT-2 style option; presumably it has
# no effect on the BERT tokenizer — confirm and drop if so.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased', add_prefix_space=True)


In [28]:
# Map token ids back to token strings for inspection.
# NOTE(review): requires `tks` to be a single-sentence tokenizer output exposing
# `.input_ids`. In file order `tks` is first a plain list (str.split) and later a
# two-sentence batch, so this cell depends on out-of-order kernel state.
tokenizer.convert_ids_to_tokens(tks.input_ids)

['[CLS]', 'i', 'have', 'an', 'apple', '.', '[SEP]']

In [50]:
# Encode a batch of two raw strings; the first one contains a literal '[PAD]'.
# In the recorded output the '[PAD]' text maps to id 0 while its attention_mask
# entry stays 1, and the two sequences have different lengths (no padding applied).
# This rebinds `tks` from the earlier list of words to a tokenizer output.
tks = tokenizer(['I have an apple. [PAD]', 'I have nothing'], add_special_tokens=True)
tks

{'input_ids': [[101, 1045, 2031, 2019, 6207, 1012, 0, 102], [101, 1045, 2031, 2498, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1]]}

In [71]:
# Hand-built ragged batch (two sequences of length 8 and 5) with an extra
# 'labels' field, used to probe how tokenizer.pad treats non-standard keys.
d = dict(
    input_ids=[
        [101, 1045, 2031, 2019, 6207, 1012, 0, 102],
        [101, 1045, 2031, 2498, 102],
    ],
    attention_mask=[
        [1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1],
    ],
    labels=[
        [1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1],
    ],
)

In [73]:
# Pad the hand-built batch `d` and inspect the return type
# (recorded output: transformers' BatchEncoding).
type(tokenizer.pad(d))

transformers.tokenization_utils_base.BatchEncoding

In [34]:
# 1-D tensor of ten elements, every one initialised to -100.
x = torch.full(size=(10,), fill_value=-100)

In [36]:
# Write the values 1, 2, 3, 4 into positions 2, 3, 5, 6 of `x` (list indexing, in place).
x[[2,3,5,6]] = torch.tensor([1,2,3,4])

In [38]:
# Overwrite the same four positions with the scalar 0 (in place).
x[[2,3,5,6]] = 0

In [39]:
# Display `x`: in the recorded output positions 2, 3, 5, 6 are 0 and all others are -100.
x

tensor([-100, -100,    0,    0, -100,    0,    0, -100, -100, -100])

In [20]:
# Tokenize a raw string into token strings, special tokens included
# (the recorded output starts with '[CLS]' and ends with '[SEP]').
tokenizer.tokenize('I have an apple.', add_special_tokens=True)

['[CLS]', 'i', 'have', 'an', 'apple', '.', '[SEP]']

In [2]:
# Read 'a.json' (UTF-8) from the working directory and parse it into `text_seq`.
# Presumably a list of word tokens, given its later use with
# is_split_into_words=True — TODO confirm against the data file.
with open('a.json', encoding='utf-8') as f:
    text_seq = json.loads(f.read())

In [6]:
# Encode the loaded sequence as pre-split words, with special tokens, and request
# character offset mappings.
# NOTE(review): return_offsets_mapping presumably needs a fast tokenizer — confirm.
# `encs` is not referenced by any later cell visible here.
encs = tokenizer(text_seq,
                 is_split_into_words=True,
                 add_special_tokens=True,
                 return_offsets_mapping=True)


In [28]:
# Inputs: a single token sequence (`text_seq`) with no per-sentence length info.
max_seq_length = 512
tk_seq_list = [text_seq]
sent_lengths_list = [None]

# Outputs: the flat list of split sequences, and for every original sequence the
# indices of its pieces inside that flat list.
split_tk_seq_list = []
ori2split_ids_map = []
n = 0  # running count of split sequences collected so far

# Original author's note: update input sentences so that every sentence has BERT
# length < 510 (presumably max_seq_length minus two special tokens; the actual
# limit is enforced inside split_overlength_bert_input_sequence — TODO confirm).
for tk_seq, sent_lens in zip(tk_seq_list, sent_lengths_list):

    if not sent_lens:
        # No sentence lengths given: treat the whole sequence as one segment.
        tk_seq_ = [tk_seq]
    else:
        # Turn sentence lengths into [start, end) boundaries and slice per sentence.
        ends = list(itertools.accumulate(sent_lens))
        starts = [0, *ends[:-1]]
        tk_seq_ = [tk_seq[lo:hi] for lo, hi in zip(starts, ends)]

    tk_seqs = split_overlength_bert_input_sequence(tk_seq_, tokenizer, max_seq_length)
    n_splits = len(tk_seqs)
    split_tk_seq_list.extend(tk_seqs)

    # Record which slots of the flat list belong to this original sequence.
    ori2split_ids_map.append([*range(n, n + n_splits)])
    n += n_splits


In [20]:
# Length of the first split sequence.
# NOTE(review): recorded as 367, while len(tk_seqs[0]) below is recorded as 366 even
# though both should refer to the same list after a single-iteration loop. This
# cell's execution count (20) predates the loop cell's (28), so the output is
# likely stale — re-run from a fresh kernel to confirm.
len(split_tk_seq_list[0])

367

In [21]:
# Number of items in the sequence loaded from 'a.json' (recorded: 366).
len(text_seq)

366

In [23]:
# Length of the loop variable left over from the split loop; with
# tk_seq_list = [text_seq] this is the same object as text_seq (recorded: 366).
len(tk_seq)

366

In [29]:
# Length of the first piece returned by the last split call (recorded: 366,
# equal to the unsplit input length).
len(tk_seqs[0])

366