# L6 - Building and Training a Tokenizer

In [8]:
!pip install datasets



In [9]:
from datasets import load_dataset

In [10]:
bc = load_dataset("bookcorpus", split="train", trust_remote_code=True)

In [12]:
num_samples=6
for idx, sample in enumerate(bc[0:num_samples]['text']):
  print(f'{idx}: {sample}')

0: usually , he would be tearing around the living room , playing with his toys .
1: but just one look at a minion sent him practically catatonic .
2: that had been megan 's plan when she got him dressed earlier .
3: he 'd seen the movie almost by mistake , considering he was a little young for the pg cartoon , but with older cousins , along with her brothers , mason was often exposed to things that were older .
4: she liked to think being surrounded by adults and older kids was one reason why he was a such a good talker for his age .
5: `` are n't you being a good boy ? ''


In [17]:
from tokenizers import Tokenizer
from tokenizers.normalizers import Lowercase
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.models import BPE

In [18]:
model = BPE(unk_token="[UNK]")
tokenizer = Tokenizer(model)

In [19]:
tokenizer.normalizer = Lowercase()
tokenizer.pre_tokenizer = Whitespace()

In [20]:
from tokenizers.trainers import BpeTrainer
trainer = BpeTrainer(vocab_size=32000, special_tokens=["[PAD]", "[UNK]"], continuing_subword_prefix="##")

In [21]:
def get_examples(batch_size=1000):
  for i in range(0, len(bc), batch_size):
    yield bc[i : i + batch_size]['text']

In [22]:
from multiprocessing import cpu_count
print(cpu_count())

2


In [23]:
tokenizer.train_from_iterator(get_examples(batch_size=10000), trainer = trainer, length=len(bc))

In [25]:
tokenizer.model.save('model', prefix='hopper')

['model/hopper-vocab.json', 'model/hopper-merges.txt']

In [26]:
with open('model/hopper-merges.txt', 'r') as file:
  row = 0
  num_lines = 10
  for line in file.readlines():
    print(line)
    row+=1
    if row >= num_lines:
      break

#version: 0.2

##h ##e

t ##he

##i ##n

##e ##r

##e ##d

##o ##u

##n ##d

##in ##g

t ##o



In [27]:
with open('model/hopper-merges.txt', 'r') as file:
  row = 0
  num_lines = 10
  for line in reversed(file.readlines()):
    print(line)
    row+=1
    if row >= num_lines:
      break

mel ##anthe

black ##er

ad ##ject

v ##ang

betroth ##al

tiptoe ##ing

restroom ##s

consol ##ing

esp ##ionage

influ ##x



In [29]:
with open('model/hopper-merges.txt', 'r') as file:
  lines = file.readlines()

print(f'The total number of merges are - {len(lines)}')
print(f'vocab size is - {tokenizer.get_vocab_size()}')

The total number of merges are - 31871
vocab size is - 32000


# L7 - Encoder and Decoder

In [30]:
sample = bc[0]['text']
print(f'sample: {sample}')
encoding = tokenizer.encode(sample)
print(encoding)

sample: usually , he would be tearing around the living room , playing with his toys .
Encoding(num_tokens=16, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])


In [31]:
token_ids = encoding.ids
tokens = encoding.tokens
type_ids = encoding.type_ids
attention_mask = encoding.attention_mask

In [32]:
from tokenizers.tools import EncodingVisualizer
visualizer = EncodingVisualizer(tokenizer=tokenizer)
visualizer(text=sample)

In [35]:
import pandas as pd
outdict = {'tokens': tokens, 'ids': token_ids, 'type_id': type_ids, 'attention_mask': attention_mask}
df = pd.DataFrame.from_dict(outdict)
df

Unnamed: 0,tokens,ids,type_id,attention_mask
0,usually,2462,0,1
1,",",19,0,1
2,he,149,0,1
3,would,277,0,1
4,be,162,0,1
5,tearing,6456,0,1
6,around,422,0,1
7,the,131,0,1
8,living,1559,0,1
9,room,536,0,1


In [39]:
from pprint import pprint
samples = bc[0:4]['text']
batch_enc = tokenizer.encode_batch(samples)
pprint(batch_enc)

[Encoding(num_tokens=16, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=14, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=14, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=42, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])]


In [41]:
tokenizer.enable_padding(direction='right', pad_id=0, pad_type_id=0, pad_token='[PAD]', length=None, pad_to_multiple_of=None)
tokenizer.enable_truncation(max_length=512)
batch_enc = tokenizer.encode_batch(samples)
pprint(batch_enc)

[Encoding(num_tokens=42, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=42, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=42, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=42, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])]


In [42]:
text = "aayi nayi wala song sunn rha hun."
encoded = tokenizer.encode(text).tokens
print(encoded)

['a', '##ay', '##i', 'nay', '##i', 'wal', '##a', 'song', 'sun', '##n', 'r', '##ha', 'hun', '.']


In [43]:
tokenizer.save('hopper.json')

In [44]:
import json
with open('hopper.json', 'r') as file:
  json_data = json.load(file)

In [47]:
pprint(json_data, depth=1)

{'added_tokens': [...],
 'decoder': None,
 'model': {...},
 'normalizer': {...},
 'padding': {...},
 'post_processor': None,
 'pre_tokenizer': {...},
 'truncation': {...},
 'version': '1.0'}


In [48]:
trained = Tokenizer(BPE())
trainer = trained.from_file('hopper.json')
tokens = trained.encode(text).tokens
print(tokens)

[]


In [49]:
bert = Tokenizer(BPE(unk_token='[UNK]'))
bert.normalizer = Lowercase()
bert.pre_tokenizer = Whitespace()
berttrain = BpeTrainer(vocab_size=32000, special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"], continuing_subword_prefix="##")


In [50]:
from tokenizers.processors import TemplateProcessing

In [51]:
bert.post_processor = TemplateProcessing()

# L8 - Wrap it with PreTrainedTokenizer

In [52]:
encoding = tokenizer.encode(text)
print(encoding)

Encoding(num_tokens=14, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])


In [53]:
from transformers import PreTrainedTokenizerFast

pt = PreTrainedTokenizerFast(tokenizer_file='hopper.json', unk_token='[UNK]', pad_token='[PAD]', model_input_names=["input_ids", "token_type_ids", "attention_mask"],)

In [54]:
model_inputs = pt(text)
pprint(model_inputs, compact=True)

{'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'input_ids': [46, 181, 82, 11951, 82, 563, 81, 2984, 1239, 85, 63, 1901, 2087,
               21],
 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}
