In [1]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

tokenizer = GPT2Tokenizer.from_pretrained("gpt2-large")
text = "In a shocking finding, scientists"
inputs = tokenizer.encode(text, return_tensors="pt")
model = GPT2LMHeadModel.from_pretrained("gpt2-large")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# check out how tokenized input looks like
inputs.shape, inputs

(torch.Size([1, 6]), tensor([[  818,   257, 14702,  4917,    11,  5519]]))

In [8]:
# turning tokens back into text
print(tokenizer.decode([818]))

In


In [9]:
# seeing how tokenization is done
for token in inputs[0]:
    print(tokenizer.decode([token]))

In
 a
 shocking
 finding
,
 scientists


In [3]:
tokenizer.vocab_size

50257

In [11]:
# begin of sentence token id is slightly different!
tokenizer.encode("the the the")

[1169, 262]

In [12]:
tokenizer.decode([1169]), tokenizer.decode([262])

('the', ' the')

In [14]:
# some characters may spand multiple tokens
tokenizer.encode("嗜")

[161, 245, 250]

In [17]:
# tokenization is case sensitive
tokenizer.encode("c c C C")

[66, 269, 327, 327]

In [16]:
tokenizer.decode([161]), tokenizer.decode([245]), tokenizer.decode([250])

('�', '�', '�')

In [6]:
outputs = model.generate(inputs, max_length=250, num_return_sequences=1, pad_token_id=tokenizer.eos_token_id)
print(tokenizer.decode(outputs[0], skip_special_tokens=True), end="")

The attention mask is not set and cannot be inferred from input because pad token is same as eos token.As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


In a shocking finding, scientists have found that the brain of a baby born with Down syndrome is not fully developed until the age of two.

The study, published in the journal Nature, found that the brain of a baby born with Down syndrome is not fully developed until the age of two.

The findings, which were made by scientists at the University of California, San Francisco, and the University of California, San Diego, are the first to show that the brain of a baby born with Down syndrome is not fully developed until the age of two.

The study, which was carried out on mice, found that the brain of a baby born with Down syndrome is not fully developed until the age of two. The findings, which were made by scientists at the University of California, San Francisco, and the University of California, San Diego, are the first to show that the brain of a baby born with Down syndrome is not fully developed until the age of two. The study, which was carried out on mice, found that the brain of 

In [20]:
# calling actual model
preds = model(inputs).logits
inputs.shape, preds.shape, preds

(torch.Size([1, 6]),
 torch.Size([1, 6, 50257]),
 tensor([[[-0.2192,  1.7349, -2.3250,  ..., -5.6592, -7.6884, -0.9767],
          [-1.1942, -0.1058, -4.2319,  ..., -6.0229, -5.4356, -0.9255],
          [ 0.5741,  1.6945, -3.0760,  ..., -4.9000, -7.8580, -1.5279],
          [ 2.0346,  2.1569, -3.5765,  ..., -5.8734, -6.3885, -0.0463],
          [-1.7774, -1.2454, -4.8257,  ..., -2.1206, -6.7244, -1.6851],
          [ 0.7919,  0.9766, -5.5937,  ..., -5.7744, -1.8508,  0.0856]]],
        grad_fn=<UnsafeViewBackward0>))

In [21]:
# print out generated token
tokenizer.decode(preds[0][-1].argmax())

' have'