In [1]:
! pip install  protobuf>=3.20


In [2]:
! pip install -q -U "tensorflow-text"
! pip install -q tensorflow_datasets


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.5/6.5 MB[0m [31m23.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
import collections
import os
import pathlib
import re
import string
import numpy as np
import tensorflow_datasets as tfds
import tensorflow_text as text
import tensorflow as tf

### load the dataset

In [4]:
import tensorflow_datasets as tfds

examples, metadata = tfds.load( 'ted_hrlr_translate/ru_to_en',
                                 with_info=True,
                                 as_supervised=True)

train_examples, val_examples = examples['train'], examples['validation']


Downloading and preparing dataset 124.94 MiB (download: 124.94 MiB, generated: Unknown size, total: 124.94 MiB) to /root/tensorflow_datasets/ted_hrlr_translate/ru_to_en/1.0.0...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Extraction completed...: 0 file [00:00, ? file/s]

Generating splits...:   0%|          | 0/3 [00:00<?, ? splits/s]

Generating train examples...:   0%|          | 0/208106 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/ted_hrlr_translate/ru_to_en/1.0.0.incompleteZ7KW8Z/ted_hrlr_translate-trai…

Generating validation examples...:   0%|          | 0/4805 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/ted_hrlr_translate/ru_to_en/1.0.0.incompleteZ7KW8Z/ted_hrlr_translate-vali…

Generating test examples...:   0%|          | 0/5476 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/ted_hrlr_translate/ru_to_en/1.0.0.incompleteZ7KW8Z/ted_hrlr_translate-test…

Dataset ted_hrlr_translate downloaded and prepared to /root/tensorflow_datasets/ted_hrlr_translate/ru_to_en/1.0.0. Subsequent calls will reuse this data.


In [5]:
for russian, english in train_examples.take(1):
  print("russian: ", russian.numpy().decode('utf-8'))
  print("English:   ", english.numpy().decode('utf-8'))


russian:  к : успех , перемены возможны только с оружием в руках .
English:    c : success , the change is only coming through the barrel of the gun .


In [6]:
train_examples

<_PrefetchDataset element_spec=(TensorSpec(shape=(), dtype=tf.string, name=None), TensorSpec(shape=(), dtype=tf.string, name=None))>

### construct training sets

In [7]:
train_english = train_examples.map(lambda russian, english: english )
train_russian = train_examples.map(lambda russian, english: russian )


In [8]:
train_english.batch(1000).prefetch(2)

<_PrefetchDataset element_spec=TensorSpec(shape=(None,), dtype=tf.string, name=None)>

#### generate the vocabularies for Russian and English

In [9]:
from tensorflow_text.tools.wordpiece_vocab import bert_vocab_from_dataset as bert_vocab

# we will consider some of special token such as padding , unknown vocabulary,
#start and end of the sequence in our argument dictionary , These tokens are important for BERT to handle various tasks like padding sequences and indicating the start and end of sentences.
special_tokens=["[PAD]", "[UNK]", "[START]", "[END]"]

#and then we define bert_tokenizer_params dictionary that contains parameters for the BERT tokenizer. In this case, lower_case is set to True, which means that the tokenizer should convert all text to lowercase before tokenizing. This is a common preprocessing step used in NLP tasks.
bert_tokenizer_params=dict(lower_case=True)
# we will add these two variables to the arguments beside The target vocabulary size
bert_vocab_args = dict(
    	reserved_tokens = special_tokens,
    	bert_tokenizer_params = bert_tokenizer_params,
    	vocab_size = 8000,
      # also we should consider arguments related to wordpiece tokenization. However,
      #we left it empty, there no specific parameters we need to add .
      learn_params={},
)

# now we can generate the vocabularies for Russian and English
russian_vocab = bert_vocab.bert_vocab_from_dataset( train_russian.batch(1000).prefetch(2),
                                               **bert_vocab_args )

english_vocab = bert_vocab.bert_vocab_from_dataset( train_english.batch(1000).prefetch(2),
                                              **bert_vocab_args )


In [10]:
print("Russian ")
print(russian_vocab[:10])
print(russian_vocab[100:110])
print(russian_vocab[-5:])
print("English")
print(english_vocab[:10])
print(english_vocab[100:110])
print(english_vocab[-5:])


Russian 
['[PAD]', '[UNK]', '[START]', '[END]', '!', '#', '$', '%', '&', "'"]
['ш', 'щ', 'ъ', 'ы', 'ь', 'э', 'ю', 'я', 'і', '՛']
['##′', '##⁄', '##∇', '##♪', '##♫']
English
['[PAD]', '[UNK]', '[START]', '[END]', '!', '#', '$', '%', '&', "'"]
['##s', 'have', 'but', 'what', 'on', 'do', 'with', 'can', 'there', 'about']
['##”', '##•', '##∇', '##♪', '##♫']


#### save vocabulary files

In [11]:
def write_vocab_file(filepath, vocab):
  with open(filepath, 'w') as f:
    for token in vocab:
      print(token, file=f)

write_vocab_file('russian_vocab.txt', russian_vocab)
write_vocab_file('english_vocab.txt', english_vocab)


#### to build the tokenizer

In [14]:
russian_tokenizer = text.BertTokenizer('russian_vocab.txt', **bert_tokenizer_params)
english_tokenizer = text.BertTokenizer('english_vocab.txt', **bert_tokenizer_params)


In [17]:
for russian_examples, english_examples in train_examples.batch(3).take(1):
  for ex in english_examples:
    print(ex.numpy())


b'c : success , the change is only coming through the barrel of the gun .'
b'the documentation and the hands-on teaching methodology is also open-source and released as the creative commons .'
b"( video ) didi pickles : it 's four o'clock in the morning ."


In [18]:
# then Run it through the Bert Tokenizer dot tokenize method. Initially, this returns a Ragged Tensor with axes (batch, word, word-piece) ,
#for you to know that A Ragged Tensor is a specific data structure used in TensorFlow to handle irregular, nested, or variable-length data.
#It is designed to work with sequences or sets of data where the individual elements have different lengths. In contrast to regular tensors,
#Ragged Tensors allow for varying lengths along one or more dimensions.

token_batch = english_tokenizer.tokenize(english_examples)
token_batch



<tf.RaggedTensor [[[41], [28], [1103], [14], [84], [243], [93], [200], [389], [218], [84],
  [6405], [87], [84], [2473], [16]]                                      ,
 [[84], [3914, 464], [85], [84], [702], [15], [104], [1495], [2346, 2024],
  [93], [187], [435], [15], [942], [85], [2533], [111], [84], [1068],
  [5725], [16]]                                                           ,
 [[10], [400], [11], [168, 379], [1026, 1125], [28], [90], [9], [57], [316],
  [53], [9], [2501], [89], [84], [813], [16]]                               ]>

In [19]:
# then we Merge the word and word-piece axes to get our familiar shape of (batch and tokens)
token_batch = token_batch.merge_dims(-2,-1)

for ex in token_batch.to_list():
  print(ex)

[41, 28, 1103, 14, 84, 243, 93, 200, 389, 218, 84, 6405, 87, 84, 2473, 16]
[84, 3914, 464, 85, 84, 702, 15, 104, 1495, 2346, 2024, 93, 187, 435, 15, 942, 85, 2533, 111, 84, 1068, 5725, 16]
[10, 400, 11, 168, 379, 1026, 1125, 28, 90, 9, 57, 316, 53, 9, 2501, 89, 84, 813, 16]


### add start and end token to each sentence

In [20]:
special_tokens=["[PAD]", "[UNK]", "[START]", "[END]"]

# we will store the indices of the [START] and [END] tokens in the reserved_tokens list, respectively.
#They are computed using TensorFlow's argmax function in combination with boolean comparison.

START = tf.argmax(tf.constant(special_tokens) == "[START]") # <tf.Tensor: shape=(), dtype=int64, numpy=3>
END = tf.argmax(tf.constant(special_tokens) == "[END]") # <tf.Tensor: shape=(), dtype=int64, numpy=3>
#We can use add_start_end function that takes a ragged tensor as input and adds [START] and [END] tokens to each sequence within the tensor
def add_start_end(ragged):
  count = ragged.bounding_shape()[0]
  starts = tf.fill([count,1], START)
  ends = tf.fill([count,1], END)
  return tf.concat([starts, ragged, ends], axis=1)

token_batch = add_start_end(token_batch)
print(token_batch)

<tf.RaggedTensor [[2, 41, 28, 1103, 14, 84, 243, 93, 200, 389, 218, 84, 6405, 87, 84, 2473,
  16, 3]                                                                  ,
 [2, 84, 3914, 464, 85, 84, 702, 15, 104, 1495, 2346, 2024, 93, 187, 435,
  15, 942, 85, 2533, 111, 84, 1068, 5725, 16, 3]                         ,
 [2, 10, 400, 11, 168, 379, 1026, 1125, 28, 90, 9, 57, 316, 53, 9, 2501, 89,
  84, 813, 16, 3]                                                           ]>


In [21]:
# If you replace the token IDs with their text representations  using tensorflow dot gather , you will see that in the first example the words "searchability" and "serendipity" have been decomposed.
txt_tokens = tf.gather(english_vocab, token_batch)
# then join tokens with spaces to get the sentences.
tf.strings.reduce_join(txt_tokens, separator=' ', axis=-1)


<tf.Tensor: shape=(3,), dtype=string, numpy=
array([b'[START] c : success , the change is only coming through the barrel of the gun . [END]',
       b'[START] the document ##ation and the hands - on teaching method ##ology is also open - source and released as the creative commons . [END]',
       b"[START] ( video ) did ##i pick ##les : it ' s four o ' clock in the morning . [END]"],
      dtype=object)>

In [22]:
#we can re-assemble words from the extracted tokens directly with, the BertTokenizer dot detokenize method
words = english_tokenizer.detokenize(token_batch)
words

<tf.RaggedTensor [[b'[START]', b'c', b':', b'success', b',', b'the', b'change', b'is',
  b'only', b'coming', b'through', b'the', b'barrel', b'of', b'the', b'gun',
  b'.', b'[END]']                                                          ,
 [b'[START]', b'the', b'documentation', b'and', b'the', b'hands', b'-',
  b'on', b'teaching', b'methodology', b'is', b'also', b'open', b'-',
  b'source', b'and', b'released', b'as', b'the', b'creative', b'commons',
  b'.', b'[END]']                                                        ,
 [b'[START]', b'(', b'video', b')', b'didi', b'pickles', b':', b'it', b"'",
  b's', b'four', b'o', b"'", b'clock', b'in', b'the', b'morning', b'.',
  b'[END]']                                                                ]>

In [23]:
#And again join tokens with spaces to form the sentences.
words = english_tokenizer.detokenize(token_batch)
tf.strings.reduce_join(words, separator=' ', axis=-1)


<tf.Tensor: shape=(3,), dtype=string, numpy=
array([b'[START] c : success , the change is only coming through the barrel of the gun . [END]',
       b'[START] the documentation and the hands - on teaching methodology is also open - source and released as the creative commons . [END]',
       b"[START] ( video ) didi pickles : it ' s four o ' clock in the morning . [END]"],
      dtype=object)>

### clean up text for detokenization

In [24]:
def cleanup_text(special_tokens, token_txt):
  # Drop the reserved tokens, except for "[UNK]".
  bad_tokens = [re.escape(tok) for tok in special_tokens if tok != "[UNK]"]
  bad_token_re = "|".join(bad_tokens)
  bad_cells = tf.strings.regex_full_match(token_txt, bad_token_re)
  result = tf.ragged.boolean_mask(token_txt, ~bad_cells)

  # Join them into strings.
  result = tf.strings.reduce_join(result, separator=' ', axis=-1)
  return result


In [25]:
for russian_examples, english_examples in train_examples.batch(3).take(1):
  for ex in english_examples:
    print(ex.numpy())

b'c : success , the change is only coming through the barrel of the gun .'
b'the documentation and the hands-on teaching methodology is also open-source and released as the creative commons .'
b"( video ) didi pickles : it 's four o'clock in the morning ."


In [26]:
token_batch = english_tokenizer.tokenize(english_examples).merge_dims(-2,-1)
token_batc = add_start_end(token_batch)
words = english_tokenizer.detokenize(token_batc)
detok_text = cleanup_text(special_tokens, words).numpy()
detok_text

array([b'c : success , the change is only coming through the barrel of the gun .',
       b'the documentation and the hands - on teaching methodology is also open - source and released as the creative commons .',
       b"( video ) didi pickles : it ' s four o ' clock in the morning ."],
      dtype=object)

### Export and save the tokenizer

In [27]:
class CustomTokenizer(tf.Module):
  def __init__(self, reserved_tokens, vocab_path):
    self.tokenizer = text.BertTokenizer(vocab_path, lower_case=True)
    self._special_tokens = special_tokens
    self._vocab_path = tf.saved_model.Asset(vocab_path)

    vocab = pathlib.Path(vocab_path).read_text().splitlines()
    self.vocab = tf.Variable(vocab)

    ## Create the signatures for export:

    # Include a tokenize signature for a batch of strings.
    self.tokenize.get_concrete_function(
        tf.TensorSpec(shape=[None], dtype=tf.string))

    # Include `detokenize` and `lookup` signatures for:
    #   * `Tensors` with shapes [tokens] and [batch, tokens]
    #   * `RaggedTensors` with shape [batch, tokens]
    self.detokenize.get_concrete_function(
        tf.TensorSpec(shape=[None, None], dtype=tf.int64))
    self.detokenize.get_concrete_function(
          tf.RaggedTensorSpec(shape=[None, None], dtype=tf.int64))

    self.lookup.get_concrete_function(
        tf.TensorSpec(shape=[None, None], dtype=tf.int64))
    self.lookup.get_concrete_function(
          tf.RaggedTensorSpec(shape=[None, None], dtype=tf.int64))

    # These `get_*` methods take no arguments
    self.get_vocab_size.get_concrete_function()
    self.get_vocab_path.get_concrete_function()
    self.get_reserved_tokens.get_concrete_function()

  @tf.function
  def tokenize(self, strings):
    enc = self.tokenizer.tokenize(strings)
    # Merge the `word` and `word-piece` axes.
    enc = enc.merge_dims(-2,-1)
    enc = add_start_end(enc)
    return enc

  @tf.function
  def detokenize(self, tokenized):
    words = self.tokenizer.detokenize(tokenized)
    return cleanup_text(self._special_tokens, words)

  @tf.function
  def lookup(self, token_ids):
    return tf.gather(self.vocab, token_ids)

  @tf.function
  def get_vocab_size(self):
    return tf.shape(self.vocab)[0]

  @tf.function
  def get_vocab_path(self):
    return self._vocab_path

  @tf.function
  def get_reserved_tokens(self):
    return tf.constant(self._special_tokens)

In [28]:
tokenizers = tf.Module()
tokenizers.russian = CustomTokenizer(special_tokens, 'russian_vocab.txt')
tokenizers.english = CustomTokenizer(special_tokens, 'english_vocab.txt')


In [29]:
model_name = 'russian_english_tokenizer'
tf.saved_model.save(tokenizers, model_name)


### Reload the tokenizer

In [30]:
reloaded_tokenizers = tf.saved_model.load(model_name)
reloaded_tokenizers.english.get_vocab_size().numpy()

7796

In [31]:
reloaded_tokenizers = tf.saved_model.load(model_name)
tokens = reloaded_tokenizers.russian.tokenize(['Привет, AI Mastery!'])
tokens.numpy()


array([[   2, 3322,   14,   41,  969,   53,  770,  539, 3986, 1215,    4,
           3]])

In [32]:
text_tokens = reloaded_tokenizers.english.lookup(tokens)
text_tokens

<tf.RaggedTensor [[b'[START]', b'precisely', b',', b'c', b'##et', b'o', b'spent', b'##ion',
  b'biologists', b'americans', b'!', b'[END]']]>

In [33]:
round_trip = reloaded_tokenizers.russian.detokenize(tokens)
print(round_trip.numpy()[0].decode('utf-8'))


привет , ai mastery !


### save it as zip file

In [34]:
!zip -r {model_name}.zip {model_name}

  adding: russian_english_tokenizer/ (stored 0%)
  adding: russian_english_tokenizer/assets/ (stored 0%)
  adding: russian_english_tokenizer/assets/russian_vocab.txt (deflated 70%)
  adding: russian_english_tokenizer/assets/english_vocab.txt (deflated 54%)
  adding: russian_english_tokenizer/variables/ (stored 0%)
  adding: russian_english_tokenizer/variables/variables.data-00000-of-00001 (deflated 59%)
  adding: russian_english_tokenizer/variables/variables.index (deflated 32%)
  adding: russian_english_tokenizer/saved_model.pb (deflated 91%)
  adding: russian_english_tokenizer/fingerprint.pb (stored 0%)
