## All together: a BERT tokenizer from scratch

In [11]:
# BERT is built on WordPiece, so the starting point is a fresh Tokenizer wrapping that model.

from tokenizers import Tokenizer
from tokenizers.models import WordPiece

# "[UNK]" is passed to the model as its unknown token.
wordpiece_model = WordPiece(unk_token="[UNK]")
bert_tokenizer = Tokenizer(wordpiece_model)

In [12]:
# BERT preprocesses text by lowercasing it and removing accents.
# The steps below are applied in order: NFD unicode normalization, lowercasing, accent stripping.
from tokenizers import normalizers
from tokenizers.normalizers import Lowercase, NFD, StripAccents

normalization_steps = [NFD(), Lowercase(), StripAccents()]
bert_tokenizer.normalizer = normalizers.Sequence(normalization_steps)

In [13]:
# Pre-tokenization only needs to split the text on whitespace and punctuation.
from tokenizers.pre_tokenizers import Whitespace

whitespace_splitter = Whitespace()
bert_tokenizer.pre_tokenizer = whitespace_splitter

In [14]:
# Post-processing reuses the template from the previous section:
#   single sequence -> [CLS] A [SEP]
#   sequence pair   -> [CLS] A [SEP] B [SEP]
from tokenizers.processors import TemplateProcessing

# (token, id) pairs for the special tokens the templates refer to.
# The ids 1 and 2 line up with the positions of "[CLS]" and "[SEP]" in the
# special_tokens list given to the trainer later in this notebook.
bert_special_tokens = [("[CLS]", 1), ("[SEP]", 2)]

bert_tokenizer.post_processor = TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=bert_special_tokens,
)

In [15]:
# Train the tokenizer on the BGL log file listed in `files` (same procedure as
# the quicktour), attach a WordPiece decoder, and save the result to disk.

import os

from tokenizers import decoders
from tokenizers.trainers import WordPieceTrainer

# The order of special_tokens matters: the post-processor above refers to
# "[CLS]" and "[SEP]" by the ids 1 and 2.
trainer = WordPieceTrainer(vocab_size=30522, special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
files = ['bgl_log_content.txt']
bert_tokenizer.train(files, trainer)

# Without a decoder, decode() simply joins tokens with spaces and leaves the
# "##" continuation markers of WordPiece sub-tokens in the text. Attaching the
# WordPiece decoder before saving stores it in the JSON file as well, so the
# tokenizer reloaded in the Decoding section merges sub-words correctly.
bert_tokenizer.decoder = decoders.WordPiece()

# Make sure the output directory exists before saving into it.
os.makedirs("data", exist_ok=True)
bert_tokenizer.save("data/bert-bgl.json")






### Decoding

In [1]:
from tokenizers import Tokenizer

# Reload the tokenizer from the JSON file written by the training cell.
tokenizer_path = "data/bert-bgl.json"
tokenizer = Tokenizer.from_file(tokenizer_path)

In [2]:
# Encode one sample log line; the bare `output` expression below makes the
# notebook display a summary of the resulting Encoding.
output = tokenizer.encode("RAS KERNEL INFO total of 1 ddr error(s) detected and corrected")
output

Encoding(num_tokens=16, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [3]:
# Show the token ids of the encoded line.
print(output.ids)

[1, 122, 126, 128, 491, 334, 21, 336, 193, 12, 57, 13, 214, 280, 285, 2]


In [4]:
# Turn the ids back into text. In the result shown below, the [CLS]/[SEP]
# tokens present in `output.tokens` do not appear in the decoded string.
tokenizer.decode(output.ids)

'ras kernel info total of 1 ddr error ( s ) detected and corrected'

In [6]:
# Show the token strings of the encoded line, including the [CLS]/[SEP] markers.
print(output.tokens)

['[CLS]', 'ras', 'kernel', 'info', 'total', 'of', '1', 'ddr', 'error', '(', 's', ')', 'detected', 'and', 'corrected', '[SEP]']
