In [6]:
# imports
import tensorflow.compat.v2 as tf
import tensorflow_datasets as tfds

In [7]:
class Dataset:
	"""
	Portuguese-to-English translation corpus with its sub-word tokenizers

	Attributes:
		data_train: the 'train' split, loaded with as_supervised=True
		data_valid: the 'validation' split, loaded with as_supervised=True
		tokenizer_pt: Portuguese tokenizer built from data_train
		tokenizer_en: English tokenizer built from data_train
	"""
	def __init__(self):
		"""
		Loads both splits of 'ted_hrlr_translate/pt_to_en' and builds the
		Portuguese and English tokenizers from the training split only
		"""
		corpus = 'ted_hrlr_translate/pt_to_en'
		self.data_train = tfds.load(corpus, split='train', as_supervised=True)
		self.data_valid = tfds.load(corpus,
									split='validation',
									as_supervised=True)
		tokenizers = self.tokenize_dataset(self.data_train)
		self.tokenizer_pt, self.tokenizer_en = tokenizers

	def tokenize_dataset(self, data):
		"""
		Builds one sub-word tokenizer per language from a corpus
		Args:
			data: tf.data.Dataset iterated as (pt, en) pairs
				pt: tf.Tensor holding the Portuguese sentence
				en: tf.Tensor holding the matching English sentence
		Each tokenizer is built with target_vocab_size=2**15
		The corpus is iterated twice, once per language
		Returns:
			tokenizer_pt: the Portuguese tokenizer
			tokenizer_en: the English tokenizer
		"""
		build = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus
		target_size = 2**15

		# Portuguese first, then English: one full pass over data for each
		portuguese = (sentence.numpy() for sentence, _ in data)
		tokenizer_pt = build(portuguese, target_vocab_size=target_size)
		english = (sentence.numpy() for _, sentence in data)
		tokenizer_en = build(english, target_vocab_size=target_size)

		return tokenizer_pt, tokenizer_en

	def encode(self, pt, en):
		"""
		Turns a sentence pair into token ids wrapped in start/end markers
		Args:
			pt: the tf.Tensor containing the Portuguese sentence
			en: the tf.Tensor containing the corresponding English sentence

		For each language the start token is that tokenizer's vocab_size
		and the end token is vocab_size + 1
		Returns:
			pt_tokens: list holding start token, Portuguese tokens, end token
			en_tokens: list holding start token, English tokens, end token
		"""
		pt_size = self.tokenizer_pt.vocab_size
		en_size = self.tokenizer_en.vocab_size

		pt_body = self.tokenizer_pt.encode(pt.numpy())
		en_body = self.tokenizer_en.encode(en.numpy())

		# vocab_size and vocab_size + 1 are used as the two marker ids
		pt_tokens = [pt_size] + pt_body + [pt_size + 1]
		en_tokens = [en_size] + en_body + [en_size + 1]

		return pt_tokens, en_tokens

In [8]:
# Task 0 Main
# Show the first (pt, en) pair of each split, then the tokenizer types.
data = Dataset()
for split in (data.data_train, data.data_valid):
    for pt, en in split.take(1):
        print(pt.numpy().decode('utf-8'))
        print(en.numpy().decode('utf-8'))
for tokenizer in (data.tokenizer_pt, data.tokenizer_en):
    print(type(tokenizer))

e quando melhoramos a procura , tiramos a única vantagem da impressão , que é a serendipidade .
and when you improve searchability , you actually take away the one advantage of print , which is serendipity .
tinham comido peixe com batatas fritas ?
did they eat fish and chips ?
<class 'tensorflow_datasets.core.deprecated.text.subword_text_encoder.SubwordTextEncoder'>
<class 'tensorflow_datasets.core.deprecated.text.subword_text_encoder.SubwordTextEncoder'>


2023-08-15 15:19:29.449872: W tensorflow/core/kernels/data/cache_dataset_ops.cc:856] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


In [9]:
# Task 1 Main
# Encode the first (pt, en) pair of each split and print the token lists.
data = Dataset()
for split in (data.data_train, data.data_valid):
    for pt, en in split.take(1):
        print(data.encode(pt, en))

([30138, 6, 36, 17925, 13, 3, 3037, 1, 4880, 3, 387, 2832, 18, 18444, 1, 5, 8, 3, 16679, 19460, 739, 2, 30139], [28543, 4, 56, 15, 1266, 20397, 10721, 1, 15, 100, 125, 352, 3, 45, 3066, 6, 8004, 1, 88, 13, 14859, 2, 28544])
([30138, 289, 15409, 2591, 19, 20318, 26024, 29997, 28, 30139], [28543, 93, 25, 907, 1366, 4, 5742, 33, 28544])


2023-08-15 15:21:50.957589: W tensorflow/core/kernels/data/cache_dataset_ops.cc:856] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.
