In [1]:
from transformers import GPT2LMHeadModel, GPT2TokenizerFast

In [2]:
# OpenAI GPT-2 English model: 12 layers, 768 hidden units, 12 heads, 117M parameters.
pretrained_weights = "gpt2"

# Load the LM-head model and the fast tokenizer from the same checkpoint name.
model = GPT2LMHeadModel.from_pretrained(pretrained_weights)
tokenizer = GPT2TokenizerFast.from_pretrained(pretrained_weights)

# GPT-2's own tokenizer

In [3]:
# Show which tokenizer class from_pretrained returned.
type(tokenizer)

transformers.models.gpt2.tokenization_gpt2_fast.GPT2TokenizerFast

In [4]:
# Encode a sample sentence (ending in two emoticons) into GPT-2 token ids.
sample_text = "This is an example of text, this is another example of text. :), :/"
ids = tokenizer.encode(sample_text)
print(ids)

[1212, 318, 281, 1672, 286, 2420, 11, 428, 318, 1194, 1672, 286, 2420, 13, 1058, 828, 1058, 14]


In [5]:
# Decoding the full id sequence gives back the original string.
tokenizer.decode(ids)

'This is an example of text, this is another example of text. :), :/'

In [6]:
# Decode every id on its own to see where the token boundaries fall.
pieces = [tokenizer.decode([token_id]) for token_id in ids]
print(pieces)
# A leading space is part of the token that follows it (' is', ' this'), so the first ','
# is a token by itself while '),' is a single merged token.
# The emoticons have no token of their own: ':)' and ':/' are split into ' :' + '),' and ' :' + '/'.

['This', ' is', ' an', ' example', ' of', ' text', ',', ' this', ' is', ' another', ' example', ' of', ' text', '.', ' :', '),', ' :', '/']


# HuggingFace preprocessing (tokenizer)

https://huggingface.co/transformers/preprocessing.html

In [8]:
# Small batch of sentences (the second one is shorter than the others) used by the
# padding and truncation examples.
text = [
    "Hello I'm a single sentence",
    "And another sentence",
    "And the very very last one",
]

In [9]:
# Tokenize the whole batch with the default call options (no padding, no return_tensors).
tmp_token = GPT2TokenizerFast.from_pretrained(pretrained_weights)
batch = tmp_token(text)
# The result holds 'input_ids' and 'attention_mask'; every row keeps its own length.
print(batch)

{'input_ids': [[15496, 314, 1101, 257, 2060, 6827], [1870, 1194, 6827], [1870, 262, 845, 845, 938, 530]], 'attention_mask': [[1, 1, 1, 1, 1, 1], [1, 1, 1], [1, 1, 1, 1, 1, 1]]}


In [10]:
# Same batch, now padded into a rectangular PyTorch tensor.
tmp_token = GPT2TokenizerFast.from_pretrained(pretrained_weights)
# Use the end-of-text token as the padding token.
tmp_token.pad_token = tmp_token.eos_token
batch = tmp_token(
    text,
    padding=True,
    truncation=True,
    max_length=100,
    return_tensors="pt",
)
# With padding: by default GPT-2 pads on the right, and padded positions get attention_mask 0.
# max_length=100 is far above these sentence lengths, so nothing is truncated here.
print(batch)

{'input_ids': tensor([[15496,   314,  1101,   257,  2060,  6827],
        [ 1870,  1194,  6827, 50256, 50256, 50256],
        [ 1870,   262,   845,   845,   938,   530]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1],
        [1, 1, 1, 0, 0, 0],
        [1, 1, 1, 1, 1, 1]])}


In [11]:
# Decode each padded row; the padding positions come back as '<|endoftext|>'.
for row_ids in batch["input_ids"]:
    print(tmp_token.decode(row_ids))

Hello I'm a single sentence
And another sentence<|endoftext|><|endoftext|><|endoftext|>
And the very very last one


In [12]:
tmp_token = GPT2TokenizerFast.from_pretrained(pretrained_weights)
tmp_token.pad_token = tmp_token.eos_token
# max_length=4 is shorter than two of the sentences, so those rows get cut;
# the three-token sentence is still padded up to length 4.
batch = tmp_token(
    text,
    padding=True,
    truncation=True,
    max_length=4,
    return_tensors="pt",
)
print(batch)
# Truncation is also applied on the right: the leading tokens are the ones kept.
for row_ids in batch["input_ids"]:
    print(tmp_token.decode(row_ids))

{'input_ids': tensor([[15496,   314,  1101,   257],
        [ 1870,  1194,  6827, 50256],
        [ 1870,   262,   845,   845]]), 'attention_mask': tensor([[1, 1, 1, 1],
        [1, 1, 1, 0],
        [1, 1, 1, 1]])}
Hello I'm a
And another sentence<|endoftext|>
And the very very


The tokenizer also accepts pairs of sentences (useful for models such as BERT). Here we use sentence pairs to experiment with `truncation` and `max_length`.

In [13]:
# First sentence of every pair.
batch_sentences = [
    "Hello I'm a single sentence",
    "And another sentence",
    "And the very very last one",
]
# Second sentence of every pair, matched by position with batch_sentences.
batch_of_second_sentences = [
    "I'm a sentence that goes with the first sentence",
    "And I should be encoded with the second sentence",
    "And I go with the very last one",
]

In [14]:
# Encode sentence pairs: each row is the first sentence followed by the second.
tmp_token = GPT2TokenizerFast.from_pretrained(pretrained_weights)
tmp_token.pad_token = tmp_token.eos_token
batch = tmp_token(
    batch_sentences,
    batch_of_second_sentences,
    padding=True,
    return_tensors="pt",
)
print(batch)
print(batch["input_ids"].shape)
# The decoded rows show the two sentences joined directly, with no separator between them.
for row_ids in batch["input_ids"]:
    print(tmp_token.decode(row_ids))

{'input_ids': tensor([[15496,   314,  1101,   257,  2060,  6827,    40,  1101,   257,  6827,
           326,  2925,   351,   262,   717,  6827],
        [ 1870,  1194,  6827,  1870,   314,   815,   307, 30240,   351,   262,
          1218,  6827, 50256, 50256, 50256, 50256],
        [ 1870,   262,   845,   845,   938,   530,  1870,   314,   467,   351,
           262,   845,   938,   530, 50256, 50256]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]])}
torch.Size([3, 16])
Hello I'm a single sentenceI'm a sentence that goes with the first sentence
And another sentenceAnd I should be encoded with the second sentence<|endoftext|><|endoftext|><|endoftext|><|endoftext|>
And the very very last oneAnd I go with the very last one<|endoftext|><|endoftext|>


In [15]:
# Shape of the padded pair batch: (number of pairs, padded sequence length).
batch['input_ids'].shape

torch.Size([3, 16])

In [16]:
tmp_token = GPT2TokenizerFast.from_pretrained(pretrained_weights)
tmp_token.pad_token = tmp_token.eos_token
# 'only_first': when a pair exceeds max_length, only the first sentence is shortened.
batch = tmp_token(
    batch_sentences,
    batch_of_second_sentences,
    padding=True,
    truncation="only_first",
    max_length=12,
    return_tensors="pt",
)
print(batch)
print(batch["input_ids"].shape)
# The first sentence is still cut from the right; the second sentence stays complete.
for row_ids in batch["input_ids"]:
    print(tmp_token.decode(row_ids))

# Print the shape once more after the decoded rows.
print(batch["input_ids"].shape)

{'input_ids': tensor([[15496,   314,    40,  1101,   257,  6827,   326,  2925,   351,   262,
           717,  6827],
        [ 1870,  1194,  6827,  1870,   314,   815,   307, 30240,   351,   262,
          1218,  6827],
        [ 1870,   262,   845,   845,  1870,   314,   467,   351,   262,   845,
           938,   530]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
torch.Size([3, 12])
Hello II'm a sentence that goes with the first sentence
And another sentenceAnd I should be encoded with the second sentence
And the very veryAnd I go with the very last one
torch.Size([3, 12])


In [55]:
tmp_token = GPT2TokenizerFast.from_pretrained(pretrained_weights)
tmp_token.pad_token = tmp_token.eos_token
# 'only_second': when a pair exceeds max_length, only the second sentence is shortened.
batch = tmp_token(
    batch_sentences,
    batch_of_second_sentences,
    padding=True,
    truncation="only_second",
    max_length=12,
    return_tensors="pt",
)
print(batch)
print(batch["input_ids"].shape)

# The second sentence is cut from the right; the first sentence stays complete.
for row_ids in batch["input_ids"]:
    print(tmp_token.decode(row_ids))

{'input_ids': tensor([[15496,   314,  1101,   257,  2060,  6827,    40,  1101,   257,  6827,
           326,  2925],
        [ 1870,  1194,  6827,  1870,   314,   815,   307, 30240,   351,   262,
          1218,  6827],
        [ 1870,   262,   845,   845,   938,   530,  1870,   314,   467,   351,
           262,   845]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
torch.Size([3, 12])
Hello I'm a single sentenceI'm a sentence that goes
And another sentenceAnd I should be encoded with the second sentence
And the very very last oneAnd I go with the very


In [56]:
tmp_token = GPT2TokenizerFast.from_pretrained(pretrained_weights)
tmp_token.pad_token = tmp_token.eos_token
# 'longest_first': tokens are dropped from whichever sentence of the pair is longer.
batch = tmp_token(
    batch_sentences,
    batch_of_second_sentences,
    padding=True,
    truncation="longest_first",
    max_length=12,
    return_tensors="pt",
)
print(batch)
print(batch["input_ids"].shape)

# Still truncated from the right. In every pair that needs cutting here the second
# sentence is the longer one, so the result is the same as with 'only_second'.
for row_ids in batch["input_ids"]:
    print(tmp_token.decode(row_ids))

{'input_ids': tensor([[15496,   314,  1101,   257,  2060,  6827,    40,  1101,   257,  6827,
           326,  2925],
        [ 1870,  1194,  6827,  1870,   314,   815,   307, 30240,   351,   262,
          1218,  6827],
        [ 1870,   262,   845,   845,   938,   530,  1870,   314,   467,   351,
           262,   845]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
torch.Size([3, 12])
Hello I'm a single sentenceI'm a sentence that goes
And another sentenceAnd I should be encoded with the second sentence
And the very very last oneAnd I go with the very


The tokenizer can also work with pre-tokenized inputs (where each sentence has already been split into words), which is useful for tasks such as NER or POS tagging.

In [20]:
# add_prefix_space=True also puts a space in front of the first word of each input,
# so the decoded rows below start with ' Hello' / ' And'.
tmp_token = GPT2TokenizerFast.from_pretrained(
    pretrained_weights,
    add_prefix_space=True,
)

# The same sentence pairs as before, but already split into words (one list per sentence).
batch_sentences = [
    ["Hello", "I'm", "a", "single", "sentence"],
    ["And", "another", "sentence"],
    ["And", "the", "very", "very", "last", "one"],
]
batch_of_second_sentences = [
    ["I'm", "a", "sentence", "that", "goes", "with", "the", "first", "sentence"],
    ["And", "I", "should", "be", "encoded", "with", "the", "second", "sentence"],
    ["And", "I", "go", "with", "the", "very", "last", "one"],
]
# is_split_into_words=True tells the tokenizer the inputs are lists of words, not raw strings.
batch = tmp_token(
    batch_sentences,
    batch_of_second_sentences,
    is_split_into_words=True,
)
print(batch)
# No return_tensors and no padding here: 'input_ids' is a plain list of lists whose rows
# have different lengths, so there is no tensor .shape to print.

for row_ids in batch["input_ids"]:
    print(tmp_token.decode(row_ids))

{'input_ids': [[18435, 314, 1101, 257, 2060, 6827, 314, 1101, 257, 6827, 326, 2925, 351, 262, 717, 6827], [843, 1194, 6827, 843, 314, 815, 307, 30240, 351, 262, 1218, 6827], [843, 262, 845, 845, 938, 530, 843, 314, 467, 351, 262, 845, 938, 530]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}
 Hello I'm a single sentence I'm a sentence that goes with the first sentence
 And another sentence And I should be encoded with the second sentence
 And the very very last one And I go with the very last one
