# Tokenizers

In [23]:
from rich import print
from transformers import BertTokenizer
import fastcore.all as fc

In [11]:
# Load the pretrained 'bert-base-cased' tokenizer through the model-specific BertTokenizer class.
tokenizer1 = BertTokenizer.from_pretrained('bert-base-cased')

In [12]:
# Sample text used by every tokenizer call in this notebook.
# The triple-quoted literal keeps its newlines and leading indentation as part of the string.
haiku = '''
    An old silent pond
    A frog jumps into the pond—
    Splash! Silence again.
    '''

In [20]:
# Encode the haiku in a single call with the BertTokenizer instance, then show the result.
output1 = tokenizer1(haiku)
print(output1)

In [14]:
from transformers import AutoTokenizer

In [15]:
# Load the same 'bert-base-cased' checkpoint through the generic AutoTokenizer entry point,
# so its output can be compared with tokenizer1.
tokenizer2 = AutoTokenizer.from_pretrained('bert-base-cased')

In [21]:
# Encode the same haiku with the AutoTokenizer-loaded instance, then show the result.
output2 = tokenizer2(haiku)
print(output2)

In [18]:
# Both loading paths must encode the haiku to identical ids; the message makes a failure self-explanatory.
assert output1.input_ids == output2.input_ids, (
    'BertTokenizer and AutoTokenizer produced different input_ids for the same text'
)

In [22]:
# Write the tokenizer's files to a local directory; the call returns a tuple of file paths,
# which is displayed as this cell's output.
tokenizer2.save_pretrained('tokenizers/bert-base-cased')

('tokenizers/bert-base-cased/tokenizer_config.json',
 'tokenizers/bert-base-cased/special_tokens_map.json',
 'tokenizers/bert-base-cased/vocab.txt',
 'tokenizers/bert-base-cased/added_tokens.json',
 'tokenizers/bert-base-cased/tokenizer.json')

In [28]:
# List the files that save_pretrained wrote.
# NOTE: running a shell command forks the kernel process; because the tokenizer was already used,
# huggingface/tokenizers prints a "process just got forked" warning before the listing.
# The warning itself suggests setting the TOKENIZERS_PARALLELISM environment variable to silence it.
!ls tokenizers/bert-base-cased

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
special_tokens_map.json tokenizer_config.json
tokenizer.json          vocab.txt


In [24]:
# Inspect the saved tokenizer configuration; read_json parses the file into a dict.
fc.Path('tokenizers/bert-base-cased/tokenizer_config.json').read_json()

{'clean_up_tokenization_spaces': True,
 'cls_token': '[CLS]',
 'do_lower_case': False,
 'mask_token': '[MASK]',
 'model_max_length': 512,
 'pad_token': '[PAD]',
 'sep_token': '[SEP]',
 'strip_accents': None,
 'tokenize_chinese_chars': True,
 'tokenizer_class': 'BertTokenizer',
 'unk_token': '[UNK]'}

In [25]:
# Inspect the saved special-token mapping ([CLS], [SEP], [PAD], [MASK], [UNK]).
fc.Path('tokenizers/bert-base-cased/special_tokens_map.json').read_json()

{'cls_token': '[CLS]',
 'mask_token': '[MASK]',
 'pad_token': '[PAD]',
 'sep_token': '[SEP]',
 'unk_token': '[UNK]'}

## Encoding
Translating text to numbers
> 
> Two-step process
> > Tokenization
> > 
> > Conversion to Input Ids

### Tokenize

In [34]:
from transformers import AutoTokenizer

In [35]:
# Tokenizer instance used for the step-by-step encoding walkthrough below.
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')

In [40]:
# Step 1 of encoding: split the raw text into tokens.
tokens = tokenizer.tokenize(haiku)

In [41]:
# Show the token list (print here is rich's print, imported at the top of the notebook).
print(tokens)

### Conversion to Input Ids

In [43]:
# Step 2 of encoding: map each token to its id.
ids = tokenizer.convert_tokens_to_ids(tokens)

In [44]:
# Show the ids produced from the tokens (no special tokens have been added yet).
print(ids)

### Prepare inputs for the model

In [46]:
# Turn the raw ids into model-ready inputs. The decoded results in the Decoding section show
# that this step adds the [CLS] and [SEP] special tokens around the sequence.
final_inputs = tokenizer.prepare_for_model(ids)

In [50]:
# The manual tokenize -> convert_tokens_to_ids -> prepare_for_model pipeline must reproduce
# the ids obtained by calling the tokenizers directly; the message explains a failure.
assert output1.input_ids == output2.input_ids == final_inputs.input_ids, (
    'step-by-step encoding does not match the input_ids from calling the tokenizer directly'
)

## Decoding

In [52]:
# Decode the prepared inputs back to text; the result includes the [CLS] and [SEP] special tokens.
tokenizer.decode(final_inputs.input_ids)

'[CLS] An old silent pond A frog jumps into the pond — Splash! Silence again. [SEP]'

In [53]:
# Decode the raw ids (before prepare_for_model); the result has no special tokens.
tokenizer.decode(ids)

'An old silent pond A frog jumps into the pond — Splash! Silence again.'