# BERT tokenizer

The main advantage of a subword tokenizer is that it interpolates between word-based and character-based tokenization. Common words get a slot in the vocabulary, but the tokenizer can also fall back to word pieces and individual characters for unknown words.


# Setup

In [1]:
import collections
import os
import pathlib
import re
import string
import sys
import tempfile
import time

import numpy as np
import matplotlib.pyplot as plt

import tensorflow_text as text
import tensorflow as tf
from dataset import get_data
from tensorflow_text.tools.wordpiece_vocab import bert_vocab_from_dataset as bert_vocab

AssertionError: Duplicate registrations for type 'experimentalOptimizer'

In [12]:
# Set the TensorFlow logger's level to ERROR.
tf.get_logger().setLevel('ERROR')
# Current working directory; not referenced again in the visible part of the notebook.
pwd = pathlib.Path.cwd()

# Get the dataset

In [17]:
# Read each CSV file as a dataset of text lines.
train_questions = tf.data.TextLineDataset('trainquestions.csv')
train_answers = tf.data.TextLineDataset('trainanswers.csv')

# Peek at the first question, then the first answer.
for dataset in (train_questions, train_answers):
  for first_line in dataset.take(1):
    print(first_line.numpy().decode('utf-8'))


Did you hear about the Native American man that drank 200 cups of tea?
He nearly drown in his own tea pee.


# Generate the vocabulary

In [18]:
# Options forwarded to `text.BertTokenizer`, both while learning the vocabulary
# and when the tokenizers are built later on.
# TODO: confirm that lower-casing is actually wanted for this corpus.
bert_tokenizer_params = {"lower_case": True}

# Special tokens that must be included in every generated vocabulary.
reserved_tokens = ["[PAD]", "[UNK]", "[START]", "[END]"]

# Keyword arguments for `bert_vocab.bert_vocab_from_dataset`.
bert_vocab_args = {
    # The target vocabulary size
    "vocab_size": 8000,
    # Reserved tokens that must be included in the vocabulary
    "reserved_tokens": reserved_tokens,
    # Arguments for `text.BertTokenizer`
    "bert_tokenizer_params": bert_tokenizer_params,
    # Arguments for `wordpiece_vocab.wordpiece_tokenizer_learner_lib.learn`
    "learn_params": {},
}

In [19]:
%%time
# Generate the questions vocabulary from the training questions, fed to the
# generator in batches of 1000 lines. Slow step; the cell is timed.
questions_vocab = bert_vocab.bert_vocab_from_dataset(
    train_questions.batch(1000).prefetch(2),
    **bert_vocab_args
)

CPU times: user 35.9 s, sys: 329 ms, total: 36.3 s
Wall time: 36.2 s


In [20]:
# Sample four windows of the vocabulary: the first ten entries, two ranges
# further in, and the last ten entries.
for window in (slice(None, 10), slice(100, 110), slice(1000, 1010), slice(-10, None)):
  print(questions_vocab[window])

['[PAD]', '[UNK]', '[START]', '[END]', '!', '"', '#', '$', '%', '&']
['between', 'say', 'have', 'difference', 'with', 'it', 'an', 'who', 'hear', 'get']
['connery', 'cop', 'dealer', 'depressed', 'elephants', 'lion', 'named', 'oral', 'stairs', 'those']
['##ı', '##ε', '##ζ', '##–', '##—', '##’', '##“', '##”', '##€', '##√']


In [21]:
# Make a vocabulary file
def write_vocab_file(filepath, vocab):
  """Writes `vocab` to `filepath` as UTF-8 text, one token per line.

  Args:
    filepath: Destination path; an existing file is overwritten.
    vocab: Iterable of token strings, written in iteration order.
  """
  # The generated vocabularies contain non-ASCII tokens (accented letters,
  # Cyrillic, emoji), so the encoding is pinned instead of relying on the
  # platform default, which may be unable to encode them.
  with open(filepath, 'w', encoding='utf-8') as f:
    for token in vocab:
      print(token, file=f)

In [22]:
# Save the questions vocabulary to disk, one token per line.
write_vocab_file('questions_vocab.txt', questions_vocab)


In [24]:
%%time
# Generate the answers vocabulary from the training answers, using the same
# arguments as for the questions vocabulary. Slow step; the cell is timed.
answers_vocab = bert_vocab.bert_vocab_from_dataset(
    train_answers.batch(1000).prefetch(2),
    **bert_vocab_args
)

CPU times: user 49.3 s, sys: 508 ms, total: 49.8 s
Wall time: 49.8 s


In [25]:
# Save the answers vocabulary to disk, one token per line.
write_vocab_file('answers_vocab.txt', answers_vocab)


In [26]:
# Sample four windows of the vocabulary: the first ten entries, two ranges
# further in, and the last ten entries.
for window in (slice(None, 10), slice(100, 110), slice(1000, 1010), slice(-10, None)):
  print(answers_vocab[window])

['[PAD]', '[UNK]', '[START]', '[END]', '!', '"', '#', '$', '%', '&']
['р', 'с', 'т', 'ь', 'ॐ', '౪', 'ಠ', 'ᄈ', 'ᅡ', 'ᆼ']
['stays', 'telling', 'truck', '##9', '##ick', '##op', '##use', '##ze', 'ant', 'bach']
['##😀', '##😂', '##😃', '##😆', '##😏', '##😔', '##😜', '##😳', '##🙇', '##🤘']


# Build the tokenizer

The `text.BertTokenizer` can be initialized by passing the vocabulary file path as the first argument.



In [28]:
# Build one tokenizer per vocabulary file, passing the same tokenizer options
# that were supplied when the vocabularies were generated.
question_tokenizer = text.BertTokenizer('questions_vocab.txt', **bert_tokenizer_params)
answer_tokenizer = text.BertTokenizer('answers_vocab.txt', **bert_tokenizer_params)

In [31]:
# Print the first batch of three answers as raw byte strings.
# The loop variable `answers` stays bound after the loop and is the input that
# the next cell tokenizes.
for answers in train_answers.batch(3).take(1):
  for a in answers:
    print(a.numpy())

b'He nearly drown in his own tea pee.'
b'Mycheexarphlexin'
b'Matt'


In [32]:
# Tokenize the examples -> (batch, word, word-piece)
# NOTE(review): `answers` is a batch taken from `train_answers`, yet it is
# encoded with `question_tokenizer` rather than `answer_tokenizer` — confirm
# this is intentional.
token_batch = question_tokenizer.tokenize(answers)
# Merge the word and word-piece axes -> (batch, tokens)
token_batch = token_batch.merge_dims(-2,-1)

# Print each example's token ids as a plain Python list.
for ex in token_batch.to_list():
  print(ex)

[124, 2342, 203, 1445, 92, 111, 640, 575, 1465, 17]
[174, 1490, 3357, 561, 2730, 344, 616, 290]
[52, 485, 152]


In [34]:
# Look up each token id in the questions vocabulary to get its word-piece string.
txt_tokens = tf.gather(questions_vocab, token_batch)
# Join each example's word pieces with spaces; as the last expression of the
# cell, the joined strings are displayed.
tf.strings.reduce_join(txt_tokens, separator=' ', axis=-1)

<tf.Tensor: shape=(3,), dtype=string, numpy=
array([b'he near ##ly drown in his own tea pee .',
       b'my ##che ##ex ##ar ##ph ##le ##x ##in', b'm ##at ##t'],
      dtype=object)>

In [37]:
# Convert the token ids back into words with the tokenizer that produced them,
# then join each example's words with spaces for display.
words = question_tokenizer.detokenize(token_batch)
tf.strings.reduce_join(words, separator=' ', axis=-1)

<tf.Tensor: shape=(3,), dtype=string, numpy=
array([b'he nearly drown in his own tea pee .', b'mycheexarphlexin',
       b'matt'], dtype=object)>

# Customization and export

The reserved_tokens reserve space at the beginning of the vocabulary, so [START] and [END] have the same indexes for questions and answers.

In [38]:
# Ids of the "[START]" and "[END]" markers: their positions in `reserved_tokens`.
START = tf.argmax(tf.constant(reserved_tokens) == "[START]")
END = tf.argmax(tf.constant(reserved_tokens) == "[END]")

def add_start_end(ragged):
  """Wraps every row of `ragged` in the START and END token ids.

  Args:
    ragged: Batch of token-id rows, as produced by the tokenizer after the
      word and word-piece axes have been merged.

  Returns:
    The batch with START prepended and END appended to each row.
  """
  # One single-element column per row of the batch.
  column_shape = [ragged.bounding_shape()[0], 1]
  return tf.concat(
      [tf.fill(column_shape, START), ragged, tf.fill(column_shape, END)],
      axis=1)

In [41]:
# Detokenize the batch after wrapping each row in the START/END ids, then join
# each example's words with spaces for display.
words = question_tokenizer.detokenize(add_start_end(token_batch))
tf.strings.reduce_join(words, separator=' ', axis=-1)

<tf.Tensor: shape=(3,), dtype=string, numpy=
array([b'[START] he nearly drown in his own tea pee . [END]',
       b'[START] mycheexarphlexin [END]', b'[START] matt [END]'],
      dtype=object)>

In [44]:
def cleanup_text(reserved_tokens, token_txt):
  """Removes reserved tokens from detokenized text and joins the rest.

  Args:
    reserved_tokens: Reserved token strings. Every one of them except "[UNK]"
      is removed from the output.
    token_txt: String tensor of tokens/words; the last axis is joined.

  Returns:
    String tensor in which the remaining entries along the last axis are
    joined with single spaces.
  """
  # One alternation pattern that matches any reserved token other than "[UNK]".
  droppable_re = "|".join(
      re.escape(tok) for tok in reserved_tokens if tok != "[UNK]")

  # Keep only the cells that are not, in their entirety, a reserved token.
  is_reserved = tf.strings.regex_full_match(token_txt, droppable_re)
  kept = tf.ragged.boolean_mask(token_txt, ~is_reserved)

  return tf.strings.reduce_join(kept, separator=' ', axis=-1)

## Export


In [45]:
class CustomTokenizer(tf.Module):
  """Exportable wrapper around a `text.BertTokenizer` built from one vocabulary file.

  Besides `tokenize` and `detokenize`, the module exposes the vocabulary
  (`lookup`, `get_vocab_size`, `get_vocab_path`) and the reserved tokens.
  `tokenize` relies on the module-level `add_start_end` helper and
  `detokenize` on the module-level `cleanup_text` helper.
  """

  def __init__(self, reserved_tokens, vocab_path):
    """Builds the tokenizer and traces the signatures to export.

    Args:
      reserved_tokens: Reserved token strings; `detokenize` removes all of
        them from its output except "[UNK]".
      vocab_path: Path to a vocabulary text file with one token per line.
    """
    self.tokenizer = text.BertTokenizer(vocab_path, lower_case=True)
    self._reserved_tokens = reserved_tokens
    self._vocab_path = tf.saved_model.Asset(vocab_path)

    # The vocabulary contains non-ASCII tokens, so decode the file explicitly
    # as UTF-8 instead of relying on the platform's default encoding.
    vocab = pathlib.Path(vocab_path).read_text(encoding='utf-8').splitlines()
    self.vocab = tf.Variable(vocab)

    ## Create the signatures for export:

    # Include a tokenize signature for a batch of strings.
    self.tokenize.get_concrete_function(
        tf.TensorSpec(shape=[None], dtype=tf.string))

    # Include `detokenize` and `lookup` signatures for:
    #   * `Tensors` with shape [batch, tokens]
    #   * `RaggedTensors` with shape [batch, tokens]
    self.detokenize.get_concrete_function(
        tf.TensorSpec(shape=[None, None], dtype=tf.int64))
    self.detokenize.get_concrete_function(
        tf.RaggedTensorSpec(shape=[None, None], dtype=tf.int64))

    self.lookup.get_concrete_function(
        tf.TensorSpec(shape=[None, None], dtype=tf.int64))
    self.lookup.get_concrete_function(
        tf.RaggedTensorSpec(shape=[None, None], dtype=tf.int64))

    # These `get_*` methods take no arguments
    self.get_vocab_size.get_concrete_function()
    self.get_vocab_path.get_concrete_function()
    self.get_reserved_tokens.get_concrete_function()

  @tf.function
  def tokenize(self, strings):
    """Encodes a batch of strings as token ids wrapped in START/END ids."""
    enc = self.tokenizer.tokenize(strings)
    # Merge the `word` and `word-piece` axes.
    enc = enc.merge_dims(-2,-1)
    enc = add_start_end(enc)
    return enc

  @tf.function
  def detokenize(self, tokenized):
    """Converts token ids back to text, one space-joined string per example.

    Reserved tokens other than "[UNK]" are removed from the result.
    """
    words = self.tokenizer.detokenize(tokenized)
    return cleanup_text(self._reserved_tokens, words)

  @tf.function
  def lookup(self, token_ids):
    """Returns the vocabulary string for each token id."""
    return tf.gather(self.vocab, token_ids)

  @tf.function
  def get_vocab_size(self):
    """Returns the number of entries in the vocabulary."""
    return tf.shape(self.vocab)[0]

  @tf.function
  def get_vocab_path(self):
    """Returns the vocabulary file tracked as a SavedModel asset."""
    return self._vocab_path

  @tf.function
  def get_reserved_tokens(self):
    """Returns the reserved tokens as a string tensor."""
    return tf.constant(self._reserved_tokens)

In [46]:
# Bundle one exportable tokenizer per vocabulary as attributes of a single
# module, so both can be saved together.
tokenizers = tf.Module()
tokenizers.question = CustomTokenizer(reserved_tokens, 'questions_vocab.txt')
tokenizers.answer = CustomTokenizer(reserved_tokens, 'answers_vocab.txt')

In [47]:
# Export the module holding both tokenizers under `model_name`.
model_name = 'BERT_tokenizer_questions_answers'
tf.saved_model.save(tokenizers, model_name)

2022-02-24 21:09:53.673615: W tensorflow/python/util/util.cc:368] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


Cause: could not parse the source code of <function _trace_resource_initializers.<locals>._wrap_obj_initializer.<locals>.<lambda> at 0x154713d40>: no matching AST found among candidates:
# coding=utf-8
(lambda : _wrap_initializer(obj))
Cause: could not parse the source code of <function _trace_resource_initializers.<locals>._wrap_obj_initializer.<locals>.<lambda> at 0x154713f80>: no matching AST found among candidates:
# coding=utf-8
(lambda : _wrap_initializer(obj))


In [48]:
# NOTE(review): this cell repeats the previous cell's export verbatim; it looks
# redundant — consider removing it.
model_name = 'BERT_tokenizer_questions_answers'
tf.saved_model.save(tokenizers, model_name)

Cause: could not parse the source code of <function _trace_resource_initializers.<locals>._wrap_obj_initializer.<locals>.<lambda> at 0x1546c97a0>: no matching AST found among candidates:
# coding=utf-8
(lambda : _wrap_initializer(obj))
Cause: could not parse the source code of <function _trace_resource_initializers.<locals>._wrap_obj_initializer.<locals>.<lambda> at 0x1546c9680>: no matching AST found among candidates:
# coding=utf-8
(lambda : _wrap_initializer(obj))


In [51]:
# Load the export back and check the size of the question vocabulary.
reloaded_tokenizers = tf.saved_model.load(model_name)
reloaded_tokenizers.question.get_vocab_size().numpy()


3659

In [52]:
# Tokenize a one-element batch with the reloaded question tokenizer and show
# the resulting token ids.
tokens = reloaded_tokenizers.question.tokenize(['Hello TensorFlow!'])
tokens.numpy()

array([[   2, 2330, 1667,   94,  443,  615, 1850,    4,    3]])

In [53]:
# Map each token id back to its vocabulary string.
text_tokens = reloaded_tokenizers.question.lookup(tokens)
text_tokens

<tf.RaggedTensor [[b'[START]', b'hello', b'ten', b'##s', b'##or', b'##f', b'##low', b'!',
  b'[END]']]>

In [56]:
# Convert the token ids back to text and print the first (only) example.
round_trip = reloaded_tokenizers.question.detokenize(tokens)

print(round_trip.numpy()[0].decode('utf-8'))

hello tensorflow !
