In [1]:
from transformers import AutoTokenizer

In [3]:
# Name of the pretrained checkpoint whose tokenizer this notebook loads.
PRE_TRAINED_MODEL_NAME = "bert-base-cased"

In [4]:
# Build the tokenizer for the checkpoint named by PRE_TRAINED_MODEL_NAME.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=PRE_TRAINED_MODEL_NAME,
)

In [5]:
# Two free-text book descriptions used as tokenizer inputs further down.
sample_book1_desc = (
    'An awesome book about the nature. There is no more you can have!'
)
sample_book2_desc = (
    'Do you want to experience some horror stories, if yes - do not hesitate to try this book out!'
)

In [6]:
# Step 1: split the first description into tokens.
tokens = tokenizer.tokenize(sample_book1_desc)
# Step 2: map each token to its vocabulary id.
token_ids = tokenizer.convert_tokens_to_ids(tokens)

# Show the sentence, its tokens and their ids on three aligned lines.
print(
    f' Sentence: {sample_book1_desc}',
    f'   Tokens: {tokens}',
    f'Token IDs: {token_ids}',
    sep='\n',
)

 Sentence: An awesome book about the nature. There is no more you can have!
   Tokens: ['An', 'awesome', 'book', 'about', 'the', 'nature', '.', 'There', 'is', 'no', 'more', 'you', 'can', 'have', '!']
Token IDs: [1760, 14918, 1520, 1164, 1103, 2731, 119, 1247, 1110, 1185, 1167, 1128, 1169, 1138, 106]


In [8]:
# Display each special token next to its id, in the order: separator,
# classifier, padding, unknown.
(
    (tokenizer.sep_token, tokenizer.sep_token_id),
    (tokenizer.cls_token, tokenizer.cls_token_id),
    (tokenizer.pad_token, tokenizer.pad_token_id),
    (tokenizer.unk_token, tokenizer.unk_token_id),
)

(('[SEP]', 102), ('[CLS]', 101), ('[PAD]', 0), ('[UNK]', 100))

In [9]:
# Encode both descriptions in a single call, as a (first, second) text pair.
encoded_input = tokenizer(
    text=sample_book1_desc,
    text_pair=sample_book2_desc,
)
print(encoded_input)

# Turn the ids back into text to inspect where the special tokens were added.
# NOTE(review): the recorded output shows "do not" coming back as "don't", so
# decode() is not an exact round trip of the input text -- presumably due to
# decode's default tokenization clean-up; confirm before relying on it.
tokenizer.decode(encoded_input["input_ids"])

{'input_ids': [101, 1760, 14918, 1520, 1164, 1103, 2731, 119, 1247, 1110, 1185, 1167, 1128, 1169, 1138, 106, 102, 2091, 1128, 1328, 1106, 2541, 1199, 5367, 2801, 117, 1191, 4208, 118, 1202, 1136, 17467, 1106, 2222, 1142, 1520, 1149, 106, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


"[CLS] An awesome book about the nature. There is no more you can have! [SEP] Do you want to experience some horror stories, if yes - don't hesitate to try this book out! [SEP]"

In [10]:
# First sentences of the batch; item i is encoded together with item i of
# batch_of_second_sentences.
batch_sentences = [
    "Hello I'm a single sentence. You are not!",
    "And another sentence",
    "And the very very last one",
]
# Second sentences of the batch, in the same order as batch_sentences.
batch_of_second_sentences = [
    "I'm a sentence that goes with the first sentence",
    "And I should be encoded with the second sentence",
    "And I go with the very last one",
]

In [11]:
# Encode the whole batch of sentence pairs in one call.
encoded_inputs = tokenizer(
    text=batch_sentences,
    text_pair=batch_of_second_sentences,
    padding='longest',            # shorter rows are filled with [PAD] (id 0)
    truncation='longest_first',
    return_tensors='pt',          # results come back as torch tensors
)
print(encoded_inputs)

{'input_ids': tensor([[  101,  8667,   146,   112,   182,   170,  1423,  5650,   119,  1192,
          1132,  1136,   106,   102,   146,   112,   182,   170,  5650,  1115,
          2947,  1114,  1103,  1148,  5650,   102],
        [  101,  1262,  1330,  5650,   102,  1262,   146,  1431,  1129, 12544,
          1114,  1103,  1248,  5650,   102,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0],
        [  101,  1262,  1103,  1304,  1304,  1314,  1141,   102,  1262,   146,
          1301,  1114,  1103,  1304,  1314,  1141,   102,     0,     0,     0,
             0,     0,     0,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1],
        [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
         0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1

In [12]:
# Overall shape of the id tensor for the batch.
print(encoded_inputs['input_ids'].shape)

# For every row: its length, then the text it decodes to (padding included).
for ids in encoded_inputs['input_ids']:
    print(len(ids), tokenizer.decode(ids), sep='\n')

torch.Size([3, 26])
26
[CLS] Hello I'm a single sentence. You are not! [SEP] I'm a sentence that goes with the first sentence [SEP]
26
[CLS] And another sentence [SEP] And I should be encoded with the second sentence [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
26
[CLS] And the very very last one [SEP] And I go with the very last one [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
