# BERT tokenizer

The main advantage of a subword tokenizer is that is interpolates between word-based on character-based tokenization. Common words get a slot in the vocab, but the tokenizer can also fall back to word peices and individual characters for unknown words


# Setup

In [4]:
import collections
import os
import pathlib
import re
import string
import sys
import tempfile
import time

import numpy as np
import matplotlib.pyplot as plt
import tensorflow_datasets as tfds


import tensorflow_text as text
import tensorflow as tf
from dataset import get_data
from tensorflow_text.tools.wordpiece_vocab import bert_vocab_from_dataset as bert_vocab

In [2]:
tf.get_logger().setLevel('ERROR')
pwd = pathlib.Path.cwd()

# Get the dataset

In [9]:
examples, metadata = tfds.load('ted_hrlr_translate/ru_to_en', with_info=True, as_supervised=True)
train_examples, val_examples = examples['train'], examples['validation']

In [23]:
for ru, en in train_examples.take(1):
  print("Portuguese: ", ru.numpy().decode('utf-8'))
  print("English:   ", en.numpy().decode('utf-8'))


Portuguese:  к : успех , перемены возможны только с оружием в руках .
English:    c : success , the change is only coming through the barrel of the gun .


2022-02-28 10:32:22.472174: W tensorflow/core/kernels/data/cache_dataset_ops.cc:768] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


In [25]:
train_en = train_examples.map(lambda ru, en: en)
train_ru = train_examples.map(lambda ru, en: ru)

# Generate the vocabulary

In [26]:
bert_tokenizer_params=dict(lower_case=True) 
reserved_tokens=["[PAD]", "[UNK]", "[START]", "[END]"]

bert_vocab_args = dict(
    # The target vocabulary size
    vocab_size = 8000,
    # Reserved tokens that must be included in the vocabulary
    reserved_tokens=reserved_tokens,
    # Arguments for `text.BertTokenizer`
    bert_tokenizer_params=bert_tokenizer_params,
    # Arguments for `wordpiece_vocab.wordpiece_tokenizer_learner_lib.learn`
    learn_params={},
)

In [29]:
%%time
ru_vocab = bert_vocab.bert_vocab_from_dataset(
    train_ru.batch(1000).prefetch(2),
    **bert_vocab_args
)

CPU times: user 12min 17s, sys: 19.2 s, total: 12min 36s
Wall time: 12min 10s


In [30]:
print(ru_vocab[:10])
print(ru_vocab[100:110])
print(ru_vocab[1000:1010])
print(ru_vocab[-10:])

['[PAD]', '[UNK]', '[START]', '[END]', '!', '#', '$', '%', '&', "'"]
['ш', 'щ', 'ъ', 'ы', 'ь', 'э', 'ю', 'я', 'і', '՛']
['трудно', 'хотела', 'далеко', 'качестве', 'мою', '##3', '##де', '##ила', 'планеты', 'большие']
['##’', '##“', '##”', '##„', '##•', '##′', '##⁄', '##∇', '##♪', '##♫']


In [31]:
#Make a vocabulary file
def write_vocab_file(filepath, vocab):
  with open(filepath, 'w') as f:
    for token in vocab:
      print(token, file=f)

In [32]:
write_vocab_file('ru_vocab.txt', ru_vocab)


In [33]:
%%time
en_vocab = bert_vocab.bert_vocab_from_dataset(
    train_en.batch(1000).prefetch(2),
    **bert_vocab_args
)

CPU times: user 2min 56s, sys: 5.19 s, total: 3min 2s
Wall time: 2min 52s


In [37]:
write_vocab_file('en_vocab.txt', en_vocab)


In [35]:
print(en_vocab[:10])
print(en_vocab[100:110])
print(en_vocab[1000:1010])
print(en_vocab[-10:])

['[PAD]', '[UNK]', '[START]', '[END]', '!', '#', '$', '%', '&', "'"]
['##s', 'have', 'but', 'what', 'on', 'do', 'with', 'can', 'there', 'about']
['revolution', '200', 'basic', 'potential', 'english', 'led', 'message', 'perfect', '##ce', 'nine']
['##–', '##—', '##‘', '##’', '##“', '##”', '##•', '##∇', '##♪', '##♫']


# Build the tokenizer

The text.BertTokenizer can be initialize by passing the vocabulary file path as the first argument.



In [38]:
ru_tokenizer = text.BertTokenizer('ru_vocab.txt', **bert_tokenizer_params)
en_tokenizer = text.BertTokenizer('en_vocab.txt', **bert_tokenizer_params)

In [46]:
for english in train_en.batch(3).take(1):
  for e in english:
    print(e.numpy().decode('utf-8'))

c : success , the change is only coming through the barrel of the gun .
the documentation and the hands-on teaching methodology is also open-source and released as the creative commons .
( video ) didi pickles : it 's four o'clock in the morning .


In [47]:
# Tokenize the examples -> (batch, word, word-piece)
token_batch = ru_tokenizer.tokenize(english)
# Merge the word and word-piece axes -> (batch, tokens)
token_batch = token_batch.merge_dims(-2,-1)

for ex in token_batch.to_list():
  print(ex)

[43, 28, 59, 6508, 2211, 5051, 539, 539, 14, 3269, 43, 6115, 4341, 2946, 649, 49, 539, 55, 2369, 1461, 1215, 4858, 3493, 60, 6115, 1545, 832, 6508, 2946, 6115, 3269, 42, 3810, 5947, 1461, 55, 4342, 3269, 47, 6508, 2369, 16]
[3269, 44, 832, 2211, 6508, 7013, 2369, 830, 770, 5876, 41, 2369, 748, 3269, 48, 4341, 748, 539, 15, 55, 2369, 60, 649, 770, 4720, 3493, 53, 5511, 6115, 832, 748, 832, 1461, 832, 2946, 1215, 49, 539, 41, 1461, 539, 832, 55, 1994, 7012, 15, 59, 832, 6508, 1545, 5051, 41, 2369, 748, 58, 649, 3381, 770, 7477, 748, 41, 539, 3269, 43, 5947, 770, 830, 969, 4343, 4858, 1671, 3028, 539, 16]
[10, 62, 969, 748, 649, 832, 11, 44, 969, 748, 969, 56, 4986, 2091, 3381, 539, 28, 49, 830, 9, 59, 46, 832, 6508, 1545, 55, 9, 43, 1461, 832, 6299, 49, 2369, 3269, 53, 7476, 2369, 3493, 16]


In [48]:
# Lookup each token id in the vocabulary.
txt_tokens = tf.gather(en_vocab, token_batch)
# Join with spaces.
tf.strings.reduce_join(txt_tokens, separator=' ', axis=-1)

<tf.Tensor: shape=(3,), dtype=string, numpy=
array([b'e : u ##ents covered ##well ##ion ##ion , queen e ##ace computation ##ological ted k ##ion q messages plastic americans customer islam v ##ace prison lab ##ents ##ological ##ace queen d permission dough plastic q deserve queen i ##ents messages .',
       b'queen f lab covered ##ents shoe messages tools spent punishment c messages blue queen j computation blue ##ion - q messages v ted spent disabled islam o syndrome ##ace lab blue lab plastic lab ##ological americans k ##ion c plastic ##ion lab q continent settled - u lab ##ents prison ##well c messages blue t ted pool spent exponentially blue c ##ion queen e dough spent tools ##et drawings customer possibly laptop ##ion .',
       b"( x ##et blue ted lab ) f ##et blue ##et r 1950s processes pool ##ion : k tools ' u h lab ##ents prison q ' e plastic lab dictionaries k messages queen o exploded messages islam ."],
      dtype=object)>

In [49]:
words = en_tokenizer.detokenize(token_batch)
tf.strings.reduce_join(words, separator=' ', axis=-1)

<tf.Tensor: shape=(3,), dtype=string, numpy=
array([b'e : uents coveredwellionion , queen eace computationological ted kion q messages plastic americans customer islam vace prison labentsologicalace queen d permission dough plastic q deserve queen ients messages .',
       b'queen f lab coveredents shoe messages tools spent punishment c messages blue queen j computation blueion - q messages v ted spent disabled islam o syndromeace lab blue lab plastic labological americans kion c plasticion lab q continent settled - u labents prisonwell c messages blue t ted pool spent exponentially blue cion queen e dough spent toolset drawings customer possibly laptopion .',
       b"( xet blue ted lab ) fet blueet r 1950s processes poolion : k tools ' u h labents prison q ' e plastic lab dictionaries k messages queen o exploded messages islam ."],
      dtype=object)>

# Custumization and export

The reserved_tokens reserve space at the beginning of the vocabulary, so [START] and [END] have the same indexes for questions and answers.

In [50]:
START = tf.argmax(tf.constant(reserved_tokens) == "[START]")
END = tf.argmax(tf.constant(reserved_tokens) == "[END]")

def add_start_end(ragged):
  count = ragged.bounding_shape()[0]
  starts = tf.fill([count,1], START)
  ends = tf.fill([count,1], END)
  return tf.concat([starts, ragged, ends], axis=1)

In [51]:
words = ru_tokenizer.detokenize(add_start_end(token_batch))
tf.strings.reduce_join(words, separator=' ', axis=-1)

<tf.Tensor: shape=(3,), dtype=string, numpy=
array([b'[START] c : success , the change is only coming through the barrel of the gun . [END]',
       b'[START] the documentation and the hands - on teaching methodology is also open - source and released as the creative commons . [END]',
       b"[START] ( video ) didi pickles : it ' s four o ' clock in the morning . [END]"],
      dtype=object)>

In [52]:
def cleanup_text(reserved_tokens, token_txt):
  # Drop the reserved tokens, except for "[UNK]".
  bad_tokens = [re.escape(tok) for tok in reserved_tokens if tok != "[UNK]"]
  bad_token_re = "|".join(bad_tokens)

  bad_cells = tf.strings.regex_full_match(token_txt, bad_token_re)
  result = tf.ragged.boolean_mask(token_txt, ~bad_cells)

  # Join them into strings.
  result = tf.strings.reduce_join(result, separator=' ', axis=-1)

  return result

## Export


In [53]:
class CustomTokenizer(tf.Module):
  def __init__(self, reserved_tokens, vocab_path):
    self.tokenizer = text.BertTokenizer(vocab_path, lower_case=True)
    self._reserved_tokens = reserved_tokens
    self._vocab_path = tf.saved_model.Asset(vocab_path)

    vocab = pathlib.Path(vocab_path).read_text().splitlines()
    self.vocab = tf.Variable(vocab)

    ## Create the signatures for export:   

    # Include a tokenize signature for a batch of strings. 
    self.tokenize.get_concrete_function(
        tf.TensorSpec(shape=[None], dtype=tf.string))

    # Include `detokenize` and `lookup` signatures for:
    #   * `Tensors` with shapes [tokens] and [batch, tokens]
    #   * `RaggedTensors` with shape [batch, tokens]
    self.detokenize.get_concrete_function(
        tf.TensorSpec(shape=[None, None], dtype=tf.int64))
    self.detokenize.get_concrete_function(
          tf.RaggedTensorSpec(shape=[None, None], dtype=tf.int64))

    self.lookup.get_concrete_function(
        tf.TensorSpec(shape=[None, None], dtype=tf.int64))
    self.lookup.get_concrete_function(
          tf.RaggedTensorSpec(shape=[None, None], dtype=tf.int64))

    # These `get_*` methods take no arguments
    self.get_vocab_size.get_concrete_function()
    self.get_vocab_path.get_concrete_function()
    self.get_reserved_tokens.get_concrete_function()

  @tf.function
  def tokenize(self, strings):
    enc = self.tokenizer.tokenize(strings)
    # Merge the `word` and `word-piece` axes.
    enc = enc.merge_dims(-2,-1)
    enc = add_start_end(enc)
    return enc

  @tf.function
  def detokenize(self, tokenized):
    words = self.tokenizer.detokenize(tokenized)
    return cleanup_text(self._reserved_tokens, words)

  @tf.function
  def lookup(self, token_ids):
    return tf.gather(self.vocab, token_ids)

  @tf.function
  def get_vocab_size(self):
    return tf.shape(self.vocab)[0]

  @tf.function
  def get_vocab_path(self):
    return self._vocab_path

  @tf.function
  def get_reserved_tokens(self):
    return tf.constant(self._reserved_tokens)

In [54]:
tokenizers = tf.Module()
tokenizers.ru = CustomTokenizer(reserved_tokens, 'ru_vocab.txt')
tokenizers.en = CustomTokenizer(reserved_tokens, 'ru_vocab.txt')

In [55]:
model_name = 'BERT_tokenizer_ru_en'
tf.saved_model.save(tokenizers, model_name)

2022-02-28 10:57:15.824300: W tensorflow/python/util/util.cc:368] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


In [57]:
reloaded_tokenizers = tf.saved_model.load(model_name)
reloaded_tokenizers.en.get_vocab_size().numpy()


7832

In [58]:
tokens = reloaded_tokenizers.en.tokenize(['Hello TensorFlow!'])
tokens.numpy()

array([[   2,   48,  649, 1461, 1461,  832,   60, 7012,  539, 7476, 4342,
        1461,  832, 4579,    4,    3]])

In [59]:
text_tokens = reloaded_tokenizers.en.lookup(tokens)
text_tokens

<tf.RaggedTensor [[b'[START]', b'h', b'##e', b'##l', b'##l', b'##o', b't', b'##en', b'##s',
  b'##or', b'##f', b'##l', b'##o', b'##w', b'!', b'[END]']]>

In [60]:
round_trip = reloaded_tokenizers.en.detokenize(tokens)

print(round_trip.numpy()[0].decode('utf-8'))

hello tensorflow !
