[Source: Kaggle "Spanish to English Translation" dataset](https://www.kaggle.com/code/sharanharsoor/spanish-to-english-translation/notebook#Create-source-and-target-sequences-using-tokenize())

Lines 3 to 29 use code from source. These lines import the datset and process the data to be used by models.

In [7]:
import os, io
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import tensorflow as tf
import warnings
import pathlib
from itertools import islice

In [2]:
tf.get_logger().setLevel('ERROR')
pwd = pathlib.Path.cwd()

In [None]:
# lines 9 to 11. Overview: Download and visualize data from Keggle(SHRAN HARSOOR)

In [3]:
zipFile = tf.keras.utils.get_file(
    'spa-eng.zip', origin='http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip',
    extract=True)

#filePath = os.path.dirname(zipFile)+"/spa-eng/spa.txt"
filePath = "/root/.keras/datasets/spa-eng_extracted/spa-eng/spa.txt"

Downloading data from http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip
[1m2638744/2638744[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


In [4]:
! head -10 /root/.keras/datasets/spa-eng_extracted/spa-eng/spa.txt

Go.	Ve.
Go.	Vete.
Go.	Vaya.
Go.	Váyase.
Hi.	Hola.
Run!	¡Corre!
Run.	Corred.
Who?	¿Quién?
Fire!	¡Fuego!
Fire!	¡Incendio!


In [8]:
def loadData(path, size=20000):
  #text = io.open(filePath, encoding='UTF-8').read()
  #lines = text.splitlines()
  #pairs = [line.split('\t') for line in lines]
  with io.open(path, 'r', encoding='utf-8', newline='') as f:
    lines = [line.rstrip('\n') for line in islice(f, size)]
  pairs = [line.split('\t') for line in lines if '\t' in line]
  sp = np.array([source for target, source in pairs])
  en = np.array([target for target, source in pairs])
  return sp, en

In [9]:
spSentences , enSentences = loadData(filePath)
print("sp sentences:", spSentences[42])
print("en sentences:", enSentences[42])

sp sentences: Yo lo sé.
en sentences: I know.


In [None]:
df = pd.DataFrame(zip(spSentences, enSentences), columns=['sp', 'en'])
df

Unnamed: 0,sp,en
0,Ve.,Go.
1,Vete.,Go.
2,Vaya.,Go.
3,Váyase.,Go.
4,Hola.,Hi.
...,...,...
118959,Hay cuatro causas principales de muertes relac...,There are four main causes of alcohol-related ...
118960,Hay madres y padres que se quedan despiertos d...,There are mothers and fathers who will lie awa...
118961,Una huella de carbono es la cantidad de contam...,A carbon footprint is the amount of carbon dio...
118962,Como suele haber varias páginas web sobre cual...,Since there are usually multiple websites on a...


In [None]:
# The following lines process the data by standardization anf tokenize.

In [None]:
import re, itertools
from collections import Counter
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

In [None]:
import unicodedata

def unicode_to_ascii(s):
    normalized = unicodedata.normalize('NFD', s)
    return ''.join(c for c in normalized if unicodedata.category(c) != 'Mn')

def preprocess_text(text):
  text = unicode_to_ascii(text.lower().strip())
  text = re.sub(r"[^a-zA-Z?.!,¿]+", " ", text)
  text = re.sub(r"([?.!,¿])", r" \1 ", text)
  text = re.sub(r'[" "]+', " ", text)
  text = text.rstrip().strip()

  return text

In [None]:
print('Original sentence:',spSentences[42])
prc_sp_sentences = [preprocess_text(w) for w in spSentences]
prc_en_sentences = [preprocess_text(w) for w in enSentences]
print('Preprocessed sentence:',prc_sp_sentences[42])

Original sentence: Yo lo sé.
Preprocessed sentence: yo lo se .


In [None]:
# the following lines will split the datasets into a train and test datasets.
# train data is used to "train" the model and test data is used to eval model.
spSentencesTrain, spSentencesVal, enSentencesTrain, enSentencesVal = train_test_split(prc_sp_sentences, prc_en_sentences, shuffle=False, test_size=0.2)
print(len(spSentencesTrain), len(spSentencesVal), len(enSentencesTrain), len(enSentencesVal))

95171 23793 95171 23793


In [None]:
print(spSentencesTrain[1])
print(spSentencesVal[1])
print(enSentencesTrain[1])
print(enSentencesVal[1])

vete .
te lo dije , pero no hiciste caso .
go .
i told you , but you didn t pay attention .


In [None]:
#defieneing the hyperparameters that will used within the models

# bufferSire -- bufffer size for the training data
bufferSize = len(spSentencesTrain)
# valBufferSize -- buffer size for the validatation data
valBufferSize = len(spSentencesVal)
# BATCHSIZE -- batch size for the training data
BATCHSIZE = 100
# embeddingDim -- embedding dimension for the input data
embeddingDim = 128
# units -- number of units in. the RNN used for model
units = 1024
# stepsPerEpoch -- number of steps to take each epoch of training
stepsPerEpoch = bufferSize
# valStepsPerEpoch -- validation steps to take during each epoch of training
valStepsPerEpoch = valBufferSize

In [None]:
trainDatasetUnbatched = tf.data.Dataset.from_tensor_slices((spSentencesTrain, enSentencesTrain))
trainDataset = trainDatasetUnbatched.shuffle(buffer_size=bufferSize).batch(BATCHSIZE)
valDatasetUnbatched = tf.data.Dataset.from_tensor_slices((spSentencesVal, enSentencesVal))
valDataset = valDatasetUnbatched.batch(BATCHSIZE)

In [None]:
##train_unbatched = trainDataset.unbatch()
train_en = trainDatasetUnbatched.map(lambda sp, en: en)
train_sp = trainDatasetUnbatched.map(lambda sp, en: sp)

In [None]:
from tensorflow_text.tools.wordpiece_vocab import bert_vocab_from_dataset as bert_vocab

In [None]:
import logging
import time
import tensorflow_datasets as tfds
import tensorflow_text as text


In [None]:
bert_tokenizer_params=dict(lower_case=True)
reserved_tokens=["[PAD]", "[UNK]", "[START]", "[END]"]

bert_vocab_args = dict(
    # The target vocabulary size
    vocab_size = 8000,
    # Reserved tokens that must be included in the vocabulary
    reserved_tokens=reserved_tokens,
    # Arguments for `text.BertTokenizer`
    bert_tokenizer_params=bert_tokenizer_params,
    # Arguments for `wordpiece_vocab.wordpiece_tokenizer_learner_lib.learn`
    learn_params={},
)

In [None]:
%%time
sp_vocab = bert_vocab.bert_vocab_from_dataset(
    train_sp.batch(1000).prefetch(2),
    **bert_vocab_args
)

CPU times: user 48.7 s, sys: 403 ms, total: 49.1 s
Wall time: 48.2 s


In [None]:
print(sp_vocab[:10])
print(sp_vocab[100:110])
print(sp_vocab[1000:1010])
print(sp_vocab[-10:])

['[PAD]', '[UNK]', '[START]', '[END]', '!', ',', '.', '?', 'a', 'b']
['puede', 'tomas', 'tan', 'mucho', '##mos', 'tiempo', 'era', 'nos', 'cuando', 'ahora']
['verlo', 'vuelve', '##ido', '##lar', '##las', '##tado', 'asiento', 'caballo', 'escribio', 'espere']
['##,', '##.', '##?', '##b', '##f', '##j', '##q', '##v', '##x', '##¿']


In [None]:
def write_vocab_file(filepath, vocab):
  with open(filepath, 'w') as f:
    for token in vocab:
      print(token, file=f)

In [None]:
write_vocab_file('sp_vocab.txt', sp_vocab)

In [None]:
%%time
en_vocab = bert_vocab.bert_vocab_from_dataset(
    train_en.batch(1000).prefetch(2),
    **bert_vocab_args
)

CPU times: user 24.5 s, sys: 387 ms, total: 24.9 s
Wall time: 24.5 s


In [None]:
print(en_vocab[:10])
print(en_vocab[100:110])
print(en_vocab[1000:1010])
print(en_vocab[-10:])

['[PAD]', '[UNK]', '[START]', '[END]', '!', ',', '.', '?', 'a', 'b']
['where', 'good', 'out', 'see', 'who', 'doesn', 'from', 'going', 'one', 'were']
['winter', '##ible', 'carry', 'charge', 'eight', 'grandfather', 'piece', 'shop', 'smell', 'solve']
['wings', 'yard', '##!', '##,', '##.', '##?', '##j', '##q', '##v', '##z']


In [None]:
write_vocab_file('en_vocab.txt', en_vocab)

In [None]:
ls *.txt

en_vocab.txt  sp_vocab.txt


In [None]:
sp_tokenizer = text.BertTokenizer('sp_vocab.txt', **bert_tokenizer_params)
en_tokenizer = text.BertTokenizer('en_vocab.txt', **bert_tokenizer_params)

In [None]:
for sp_examples, en_examples in trainDataset.take(1):
  for ex in en_examples:
    print(ex.numpy())

b'who does that belong to ?'
b'warn tom .'
b'i just wanted to see you .'
b'the plans were discarded .'
b'all the students come from the us .'
b'since i was sick , i didn t go .'
b'you have my word .'
b'my husband always reads in bed .'
b'all the students were there .'
b'my heart was filled with happiness .'
b'no one speaks this language anymore .'
b'i walk in the forest every day .'
b'you don t need to work today .'
b'i know it s an experiment .'
b'we want our money .'
b'this room doesn t get much sun .'
b'no one will believe you .'
b'i m not telling you .'
b'that s what you all say .'
b'i have a bad pain in my back .'
b'i have a special surprise for you .'
b'i screamed .'
b'tell me what you want for christmas .'
b'half of these are mine .'
b'we re in love .'
b'it is wrong to steal money .'
b'no one knows why .'
b'i recommend maui .'
b'thanks for your cooperation .'
b'just do what i tell you .'
b'his smug behavior is offensive .'
b'don t point your gun at me .'
b'tom hugged mary tightl

In [None]:
token_batch = en_tokenizer.tokenize(en_examples)
token_batch = token_batch.merge_dims(-2, -1)

for ex in token_batch.to_list():
  print(ex)

[104, 143, 43, 1528, 35, 7]
[1639, 37, 6]
[16, 111, 203, 35, 103, 34, 6]
[36, 1036, 109, 11, 1564, 2593, 1260, 121, 6]
[83, 36, 596, 93, 106, 36, 120, 6]
[737, 16, 48, 427, 5, 16, 85, 27, 71, 6]
[34, 47, 46, 456, 6]
[46, 801, 171, 2238, 41, 275, 6]
[83, 36, 596, 109, 73, 6]
[46, 698, 48, 1243, 67, 1540, 6]
[98, 108, 554, 45, 721, 405, 6]
[16, 392, 41, 36, 1914, 191, 139, 6]
[34, 53, 27, 99, 35, 130, 140, 6]
[16, 65, 40, 26, 96, 2804, 6]
[50, 58, 167, 135, 6]
[45, 156, 105, 27, 90, 132, 868, 6]
[98, 108, 81, 237, 34, 6]
[16, 20, 64, 1018, 34, 6]
[43, 26, 52, 34, 83, 160, 6]
[16, 47, 8, 270, 636, 41, 46, 137, 6]
[16, 47, 8, 1314, 964, 56, 34, 6]
[16, 2447, 6]
[119, 44, 52, 34, 58, 56, 749, 6]
[773, 49, 213, 54, 327, 6]
[50, 72, 41, 158, 6]
[40, 38, 253, 35, 2021, 135, 6]
[98, 108, 249, 95, 6]
[16, 1864, 20, 407, 1525, 870, 6]
[644, 56, 57, 10, 3149, 2764, 782, 6]
[111, 42, 52, 16, 119, 34, 6]
[60, 26, 970, 1788, 1527, 38, 196, 2900, 688, 6]
[53, 27, 927, 57, 934, 68, 44, 6]
[37, 2321, 59

In [None]:
# Lookup each token id in the vocabulary.
txt_tokens = tf.gather(en_vocab, token_batch)
# Join with spaces.
tf.strings.reduce_join(txt_tokens, separator=' ', axis=-1)

<tf.Tensor: shape=(100,), dtype=string, numpy=
array([b'who does that belong to ?', b'warn tom .',
       b'i just wanted to see you .',
       b'the plans were d ##is ##c ##ard ##ed .',
       b'all the students come from the us .',
       b'since i was sick , i didn t go .', b'you have my word .',
       b'my husband always reads in bed .',
       b'all the students were there .',
       b'my heart was filled with happiness .',
       b'no one speaks this language anymore .',
       b'i walk in the forest every day .',
       b'you don t need to work today .', b'i know it s an experiment .',
       b'we want our money .', b'this room doesn t get much sun .',
       b'no one will believe you .', b'i m not telling you .',
       b'that s what you all say .', b'i have a bad pain in my back .',
       b'i have a special surprise for you .', b'i screamed .',
       b'tell me what you want for christmas .',
       b'half of these are mine .', b'we re in love .',
       b'it is wrong to ste

In [None]:
words = en_tokenizer.detokenize(token_batch)
tf.strings.reduce_join(words, separator=' ', axis=-1)

<tf.Tensor: shape=(100,), dtype=string, numpy=
array([b'who does that belong to ?', b'warn tom .',
       b'i just wanted to see you .', b'the plans were discarded .',
       b'all the students come from the us .',
       b'since i was sick , i didn t go .', b'you have my word .',
       b'my husband always reads in bed .',
       b'all the students were there .',
       b'my heart was filled with happiness .',
       b'no one speaks this language anymore .',
       b'i walk in the forest every day .',
       b'you don t need to work today .', b'i know it s an experiment .',
       b'we want our money .', b'this room doesn t get much sun .',
       b'no one will believe you .', b'i m not telling you .',
       b'that s what you all say .', b'i have a bad pain in my back .',
       b'i have a special surprise for you .', b'i screamed .',
       b'tell me what you want for christmas .',
       b'half of these are mine .', b'we re in love .',
       b'it is wrong to steal money .', b'no o

CUSTOMIZATION AND EXPORTTTT

In [None]:
START = tf.argmax(tf.constant(reserved_tokens) == "[START]")
END = tf.argmax(tf.constant(reserved_tokens) == "[END]")

def add_start_end(ragged):
  count = ragged.bounding_shape()[0]
  starts = tf.fill([count,1], START)
  ends = tf.fill([count,1], END)
  return tf.concat([starts, ragged, ends], axis=1)

In [None]:
words = en_tokenizer.detokenize(add_start_end(token_batch))
tf.strings.reduce_join(words, separator=' ', axis=-1)

<tf.Tensor: shape=(100,), dtype=string, numpy=
array([b'[START] who does that belong to ? [END]',
       b'[START] warn tom . [END]',
       b'[START] i just wanted to see you . [END]',
       b'[START] the plans were discarded . [END]',
       b'[START] all the students come from the us . [END]',
       b'[START] since i was sick , i didn t go . [END]',
       b'[START] you have my word . [END]',
       b'[START] my husband always reads in bed . [END]',
       b'[START] all the students were there . [END]',
       b'[START] my heart was filled with happiness . [END]',
       b'[START] no one speaks this language anymore . [END]',
       b'[START] i walk in the forest every day . [END]',
       b'[START] you don t need to work today . [END]',
       b'[START] i know it s an experiment . [END]',
       b'[START] we want our money . [END]',
       b'[START] this room doesn t get much sun . [END]',
       b'[START] no one will believe you . [END]',
       b'[START] i m not telling you . [

In [None]:
# detokenization
def cleanup_text(reserved_tokens, token_txt):
  # drop reserved tokens except for [UNK]
  bad_tokens = [re.escape(tok) for tok in reserved_tokens if tok != "[UNK]"]
  bad_token_re = "|".join(bad_tokens)

  bad_cells = tf.strings.regex_full_match(token_txt, bad_token_re)
  result = tf.ragged.boolean_mask(token_txt, ~bad_cells)

  # Join them into strings
  result = tf.strings.reduce_join(result, separator=' ', axis=-1)
  return result

In [None]:
en_examples.numpy()

array([b'who does that belong to ?', b'warn tom .',
       b'i just wanted to see you .', b'the plans were discarded .',
       b'all the students come from the us .',
       b'since i was sick , i didn t go .', b'you have my word .',
       b'my husband always reads in bed .',
       b'all the students were there .',
       b'my heart was filled with happiness .',
       b'no one speaks this language anymore .',
       b'i walk in the forest every day .',
       b'you don t need to work today .', b'i know it s an experiment .',
       b'we want our money .', b'this room doesn t get much sun .',
       b'no one will believe you .', b'i m not telling you .',
       b'that s what you all say .', b'i have a bad pain in my back .',
       b'i have a special surprise for you .', b'i screamed .',
       b'tell me what you want for christmas .',
       b'half of these are mine .', b'we re in love .',
       b'it is wrong to steal money .', b'no one knows why .',
       b'i recommend maui .', 

In [None]:
token_batch = en_tokenizer.tokenize(en_examples).merge_dims(-2, -1)
words = en_tokenizer.detokenize(token_batch)
words

<tf.RaggedTensor [[b'who', b'does', b'that', b'belong', b'to', b'?'],
 [b'warn', b'tom', b'.'],
 [b'i', b'just', b'wanted', b'to', b'see', b'you', b'.'],
 [b'the', b'plans', b'were', b'discarded', b'.'],
 [b'all', b'the', b'students', b'come', b'from', b'the', b'us', b'.'],
 [b'since', b'i', b'was', b'sick', b',', b'i', b'didn', b't', b'go', b'.'],
 [b'you', b'have', b'my', b'word', b'.'],
 [b'my', b'husband', b'always', b'reads', b'in', b'bed', b'.'],
 [b'all', b'the', b'students', b'were', b'there', b'.'],
 [b'my', b'heart', b'was', b'filled', b'with', b'happiness', b'.'],
 [b'no', b'one', b'speaks', b'this', b'language', b'anymore', b'.'],
 [b'i', b'walk', b'in', b'the', b'forest', b'every', b'day', b'.'],
 [b'you', b'don', b't', b'need', b'to', b'work', b'today', b'.'],
 [b'i', b'know', b'it', b's', b'an', b'experiment', b'.'],
 [b'we', b'want', b'our', b'money', b'.'],
 [b'this', b'room', b'doesn', b't', b'get', b'much', b'sun', b'.'],
 [b'no', b'one', b'will', b'believe', b'you',

In [None]:
cleanup_text(reserved_tokens, words).numpy()

array([b'who does that belong to ?', b'warn tom .',
       b'i just wanted to see you .', b'the plans were discarded .',
       b'all the students come from the us .',
       b'since i was sick , i didn t go .', b'you have my word .',
       b'my husband always reads in bed .',
       b'all the students were there .',
       b'my heart was filled with happiness .',
       b'no one speaks this language anymore .',
       b'i walk in the forest every day .',
       b'you don t need to work today .', b'i know it s an experiment .',
       b'we want our money .', b'this room doesn t get much sun .',
       b'no one will believe you .', b'i m not telling you .',
       b'that s what you all say .', b'i have a bad pain in my back .',
       b'i have a special surprise for you .', b'i screamed .',
       b'tell me what you want for christmas .',
       b'half of these are mine .', b'we re in love .',
       b'it is wrong to steal money .', b'no one knows why .',
       b'i recommend maui .', 

In [None]:
class CustomTokenizer(tf.Module):
  def __init__(self, reserved_tokens, vocab_path):
    self.tokenizer = text.BertTokenizer(vocab_path, lower_case=True)
    self._reserved_tokens = reserved_tokens
    self._vocab_path = tf.saved_model.Asset(vocab_path)

    vocab = pathlib.Path(vocab_path).read_text().splitlines()
    self.vocab = tf.Variable(vocab)

    ## Create the signatures for export:

    # Include a tokenize signature for a batch of strings.
    self.tokenize.get_concrete_function(
        tf.TensorSpec(shape=[None], dtype=tf.string))

    # Include `detokenize` and `lookup` signatures for:
    #   * `Tensors` with shapes [tokens] and [batch, tokens]
    #   * `RaggedTensors` with shape [batch, tokens]
    self.detokenize.get_concrete_function(
        tf.TensorSpec(shape=[None, None], dtype=tf.int64))
    self.detokenize.get_concrete_function(
          tf.RaggedTensorSpec(shape=[None, None], dtype=tf.int64))

    self.lookup.get_concrete_function(
        tf.TensorSpec(shape=[None, None], dtype=tf.int64))
    self.lookup.get_concrete_function(
          tf.RaggedTensorSpec(shape=[None, None], dtype=tf.int64))

    # These `get_*` methods take no arguments
    self.get_vocab_size.get_concrete_function()
    self.get_vocab_path.get_concrete_function()
    self.get_reserved_tokens.get_concrete_function()

  @tf.function
  def tokenize(self, strings):
    enc = self.tokenizer.tokenize(strings)
    # Merge the `word` and `word-piece` axes.
    enc = enc.merge_dims(-2,-1)
    enc = add_start_end(enc)
    return enc

  @tf.function
  def detokenize(self, tokenized):
    words = self.tokenizer.detokenize(tokenized)
    return cleanup_text(self._reserved_tokens, words)

  @tf.function
  def lookup(self, token_ids):
    return tf.gather(self.vocab, token_ids)

  @tf.function
  def get_vocab_size(self):
    return tf.shape(self.vocab)[0]

  @tf.function
  def get_vocab_path(self):
    return self._vocab_path

  @tf.function
  def get_reserved_tokens(self):
    return tf.constant(self._reserved_tokens)

In [None]:
# buiild tokenizer for eazh language

tokenizers = tf.Module()
tokenizers.en = CustomTokenizer(reserved_tokens, 'en_vocab.txt')
tokenizers.sp = CustomTokenizer(reserved_tokens, 'sp_vocab.txt')

In [None]:
# export as a saved_model
model_name = 'sp_en_converter'
tf.saved_model.save(tokenizers, model_name)

In [None]:
reloaded_tokenizers = tf.saved_model.load(model_name)
reloaded_tokenizers.en.get_vocab_size().numpy()

np.int32(3333)

In [None]:
tokens = reloaded_tokenizers.en.tokenize(['Hello TensorFlow!'])
tokens.numpy()

array([[   2, 1581,  377,   66,  689, 1391, 2279,    4,    3]])

In [None]:
text_tokens = reloaded_tokenizers.en.lookup(tokens)
text_tokens

<tf.RaggedTensor [[b'[START]', b'hello', b'ten', b'##s', b'##or', b'##f', b'##low', b'!',
  b'[END]']]>

In [None]:
round_trip = reloaded_tokenizers.en.detokenize(tokens)

print(round_trip.numpy()[0].decode('utf-8'))

hello tensorflow !


In [None]:
!zip -r {model_name}.zip {model_name}

  adding: sp_en_converter/ (stored 0%)
  adding: sp_en_converter/fingerprint.pb (stored 0%)
  adding: sp_en_converter/saved_model.pb (deflated 91%)
  adding: sp_en_converter/assets/ (stored 0%)
  adding: sp_en_converter/assets/sp_vocab.txt (deflated 56%)
  adding: sp_en_converter/assets/en_vocab.txt (deflated 52%)
  adding: sp_en_converter/variables/ (stored 0%)
  adding: sp_en_converter/variables/variables.data-00000-of-00001 (deflated 50%)
  adding: sp_en_converter/variables/variables.index (deflated 33%)


In [None]:
!du -h *.zip

120K	sp_en_converter.zip


In [None]:
!path model_name

/bin/bash: line 1: path: command not found
