In [1]:
!pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7

In [10]:
import os
import torch
from datasets import load_dataset
from transformers import  AutoTokenizer



### 1. Create Llama-2-7b-chat-hf tokenizer

In [11]:
# Model identifier handed to AutoTokenizer.from_pretrained below.
model_name = "NousResearch/Llama-2-7b-chat-hf"
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    # NOTE(review): this permits repository-supplied code to run; confirm it is needed.
    trust_remote_code=True,
)
# Pad on the right and reuse the end-of-sequence token as the padding token.
# NOTE(review): presumably this tokenizer ships without a pad token — confirm.
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

In [12]:
# Show which concrete tokenizer class AutoTokenizer.from_pretrained returned.
type(tokenizer)

transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast

### 2. Look into token encoding

In [13]:
# Encode a short sentence to see the list of token ids it maps to.
tokenizer.encode("this is a dog")

[1, 445, 338, 263, 11203]

Let us try to decode this vector

In [14]:
# Decode each id from the encoded sentence on its own, in order.
tuple(tokenizer.decode(token_id) for token_id in (1, 445, 338, 263, 11203))

('<s>', 'this', 'is', 'a', 'dog')

We can see the mapping 'this' -> 445, 'is' -> 338, 'a' -> 263, 'dog' -> 11203. In addition, token id 1, i.e. `<s>` (the start-of-sequence marker), has been inserted at the start.

We can play around with this further:

In [19]:
# Look up the id that the vocabulary stores for the exact string "this".
tokenizer.vocab["this"]

1366

We can see that this value is not the encoded value 445. Why?

In [33]:
# Collect the vocabulary strings stored under the two candidate ids
# (the direct lookup result and the id produced by encode), then print them.
matching_tokens = [
    token
    for token, token_id in tokenizer.vocab.items()
    if token_id in (1366, 445)
]
for token in matching_tokens:
    print(token)

▁this
this


In [32]:
# Decode both candidate ids to compare the text each one yields.
tuple(tokenizer.decode(token_id) for token_id in (1366, 445))

('this', 'this')

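Interesting: there are two "this" tokens. One is '▁this' with id 445 (the id that `encode` produced above), the other is the bare 'this' with id 1366 (the id returned by `tokenizer.vocab["this"]`). The leading '▁' marks a token that begins a new word; `decode` drops it, so both ids decode to 'this'.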

The same goes for 'dog': the vocabulary value is not the 11203 we saw in the encoding.

In [38]:
# Look up the id that the vocabulary stores for the exact string "dog".
tokenizer.vocab["dog"]

26169

Now the question is: how many different token ids decode to 'this' or 'dog'?

In [35]:
# Collect the vocabulary strings stored under the two candidate ids
# (the direct lookup result and the id produced by encode), then print them.
matching_tokens = [
    token
    for token, token_id in tokenizer.vocab.items()
    if token_id in (26169, 11203)
]
for token in matching_tokens:
    print(token)

dog
▁dog


In [36]:
# Reverse lookup: print every id in the vocabulary whose decoded text is 'this'.
# Only the ids are needed, so iterate over the values directly.
for token_id in tokenizer.vocab.values():
    decoded = tokenizer.decode(token_id)
    if decoded == 'this':
        print(token_id)

445
1366


In [37]:
# Reverse lookup: print every id in the vocabulary whose decoded text is 'dog'.
# Only the ids are needed, so iterate over the values directly.
for token_id in tokenizer.vocab.values():
    decoded = tokenizer.decode(token_id)
    if decoded == 'dog':
        print(token_id)

26169
11203


### 3. Further looking into encoding

In [39]:
# Encode a sentence with less common words to see how many ids each word produces.
tokenizer.encode("this malware is Emotet")

[1, 445, 4439, 2519, 338, 2812, 327, 300]

Let us check how "malware" and "Emotet" are encoded.

In [41]:
# "malware" is looked up directly to demonstrate that it is missing from the
# vocabulary. The KeyError is the point of the cell, so it is caught and shown
# instead of being left to abort a Restart & Run All.
try:
    malware_id = tokenizer.vocab["malware"]
except KeyError as err:
    print(f"KeyError: {err}")
else:
    print(malware_id)

KeyError: 'malware'

Clearly, "malware" is not in the tokenizer's vocabulary, so how is it encoded?

In [40]:
# Decode the two ids that appeared where "malware" was in the encoded sentence.
tuple(tokenizer.decode(token_id) for token_id in (4439, 2519))

('mal', 'ware')

Now we can see that it has been split into two parts, "mal" and "ware".

The same for Emotet:

In [43]:
# Decode the three ids that appeared where "Emotet" was in the encoded sentence.
tuple(tokenizer.decode(token_id) for token_id in (2812, 327, 300))

('Em', 'ot', 'et')

It has been split into 3 tokens. Now you know what a token really means for an LLM: it is often one word, but not always — a word that is missing from the vocabulary is broken into several sub-word tokens.