In [1]:
from importlib.metadata import version
import tiktoken

# Show which tiktoken release is installed in this environment.
print("tiktoken version:", version("tiktoken"))

tiktoken version: 0.9.0


In [2]:
# Load tiktoken's "gpt2" encoding; the cells below use it to encode and decode.
tokenizer = tiktoken.get_encoding("gpt2")

In [4]:
# Demo string that contains the "<|endoftext|>" special-token text.
# NOTE(review): the two adjacent literals are joined with no space between
# them, so the text reads "terracesof" -- presumably a missing space; confirm
# before changing, since the recorded outputs depend on the current string.
text = (
    "Hello, do you like tea? <|endoftext|> In the sunlit terraces"
    "of someunknownPlace."
)

# First attempt: a plain encode() call. With this text it raises ValueError
# (special token not allowed); the error is caught and printed so the cell
# keeps running.
try:
    ids = tokenizer.encode(text)
    print(ids)
except ValueError as e:
    print(e)


# Second attempt: explicitly allow "<|endoftext|>" so it is encoded as a
# special token instead of being rejected. `ids` is reused by the decode cell.
ids = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
print(ids)

Encountered text corresponding to disallowed special token '<|endoftext|>'.
If you want this text to be encoded as a special token, pass it to `allowed_special`, e.g. `allowed_special={'<|endoftext|>', ...}`.
If you want this text to be encoded as normal text, disable the check for this token by passing `disallowed_special=(enc.special_tokens_set - {'<|endoftext|>'})`.
To disable this check for all special tokens, pass `disallowed_special=()`.

[15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 2114, 1659, 617, 34680, 27271, 13]


In [5]:
# Round-trip check: turn the token ids from the previous cell back into text.
decoded_text = tokenizer.decode(ids)
print(decoded_text)

Hello, do you like tea? <|endoftext|> In the sunlit terracesof someunknownPlace.


In [6]:
# Encode a string of made-up words and print each resulting id next to the
# text piece it decodes to.
unknown_words = "Akwirw ier"
unknown_words_ids = tokenizer.encode(unknown_words)

# Named `token_id` rather than `id`: a module-level loop variable called `id`
# stays in the notebook namespace after the cell runs and shadows the built-in
# id() for every later cell.
for token_id in unknown_words_ids:
    # decode() is given a list here, so the single id is wrapped in one.
    print(f"id: {token_id}, token: {tokenizer.decode([token_id])}")

id: 33901, token: Ak
id: 86, token: w
id: 343, token: ir
id: 86, token: w
id: 220, token:  
id: 959, token: ier


In [7]:
# Read the whole text file into memory (text mode is open()'s default).
with open("the-verdict.txt", encoding="utf-8") as source_file:
    raw_text = source_file.read()

# Tokenize the full text in one call and report how many token ids it yields.
tokenized_raw_text = tokenizer.encode(raw_text)
print(len(tokenized_raw_text))

5145


In [8]:
# Drop the first 50 token ids; the remainder is the sample the cells below
# slice their input/target windows from.
tokenized_sample = tokenized_raw_text[50:]

In [10]:
# Number of token ids in one input window.
context_size = 4
# x: the first `context_size` ids of the sample.
x = tokenized_sample[:context_size]
# y: the same-length window shifted one position to the right, so y[k] is the
# id that follows x[k] in the sample.
y = tokenized_sample[1:context_size+1]

# The extra spaces after "y:" are part of the printed string.
print(f"x: {x}")
print(f"y:      {y}")

x: [290, 4920, 2241, 287]
y:      [4920, 2241, 287, 257]


In [14]:
# Walk through growing prefixes of the sample: each prefix is paired with the
# single token id that comes right after it.
for prefix_len in range(1, context_size + 1):
    context, target = tokenized_sample[:prefix_len], tokenized_sample[prefix_len]
    # First as raw ids, then decoded back to text for readability.
    print(context, "---->", target)
    print(tokenizer.decode(context), "---->", tokenizer.decode([target]))

[290] ----> 4920
 and ---->  established
[290, 4920] ----> 2241
 and established ---->  himself
[290, 4920, 2241] ----> 287
 and established himself ---->  in
[290, 4920, 2241, 287] ----> 257
 and established himself in ---->  a
