#### tiktoken is a fast BPE tokeniser for use with OpenAI's models.
##### https://github.com/openai/tiktoken

In [7]:
pip install tiktoken

Note: you may need to restart the kernel to use updated packages.


#### Step 1: Load a tokenizer for a model (e.g., GPT-3.5)

In [8]:
# No earlier cell visible in this notebook imports tiktoken, so import it here;
# otherwise this cell raises NameError on a fresh kernel (Restart & Run All).
import tiktoken

# Look up the encoding that tiktoken associates with the "gpt-3.5-turbo" model name.
encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")



#### Step 2: Text to encode

In [9]:
# Sample sentence used as the input for the encode/decode steps below.
text = "Hello world! I am a Software Engineer."

#### Step 3: Encode (text -> token IDs)

In [10]:
# Convert the sample text into its sequence of token IDs and display them.
token_ids = encoding.encode(text)
print(f"Token IDs: {token_ids}")

Token IDs: [9906, 1917, 0, 358, 1097, 264, 4476, 29483, 13]


#### Step 4: Decode (token IDs -> text)

In [11]:
# Map the token IDs back to a string so it can be compared with the input text.
decoded_text = encoding.decode(token_ids)
print(f"Decoded text: {decoded_text}")

Decoded text: Hello world! I am a Software Engineer.


#### Step 5: Show tokens with their IDs

In [12]:
# Decode every ID on its own so each token's text can be shown beside its ID.
tokens = []
for token_id in token_ids:
    tokens.append(encoding.decode([token_id]))

# !r keeps the quotes and leading spaces visible, making token boundaries obvious.
print("\nTokens with IDs:")
for token_id, token_text in zip(token_ids, tokens):
    print(f"{token_id}: {token_text!r}")


Tokens with IDs:
9906: 'Hello'
1917: ' world'
0: '!'
358: ' I'
1097: ' am'
264: ' a'
4476: ' Software'
29483: ' Engineer'
13: '.'
