In [1]:
import os
#Legacy TensorFlow BackEnd
os.environ['TF_USE_LEGACY_KERAS'] = '1'

In [13]:
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import pathlib
import unicodedata

In [19]:
config={
    "max_vocab_size":5000,
    "max_length":50,
    "BATCH_SIZE":64,
    "Split_Ratio":0.9,
}

In [7]:
path_to_zip = tf.keras.utils.get_file(
    'spa-eng.zip', origin='http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip',
    extract=True)

path_to_file = pathlib.Path(path_to_zip).parent/'spa-eng/spa.txt'

In [20]:
def load_data(path):
  text = path.read_text(encoding='utf-8')

  lines = text.splitlines()
  pairs = [line.split('\t') for line in lines]

  context = np.array([context for target, context in pairs])
  target = np.array([target for target, context in pairs])

  return target, context

In [21]:
context_raw,target_raw = load_data(path_to_file)
print(context_raw[-1])

If you want to sound like a native speaker, you must be willing to practice saying the same sentence over and over in the same way that banjo players practice the same phrase over and over until they can play it correctly and at the desired tempo.


In [22]:
print(target_raw[-1])

Si quieres sonar como un hablante nativo, debes estar dispuesto a practicar diciendo la misma frase una y otra vez de la misma manera en que un músico de banjo practica el mismo fraseo una y otra vez hasta que lo puedan tocar correctamente y en el tiempo esperado.


In [23]:
split_idx = int(config["Split_Ratio"] * len(target_raw))

X_train = context_raw[:split_idx]
y_train = target_raw[:split_idx]

X_val = context_raw[split_idx:]
y_val = target_raw[split_idx:]

BUFFER_SIZE = len(X_train)

train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
train_dataset = train_dataset.shuffle(BUFFER_SIZE).batch(config['BATCH_SIZE'], drop_remainder=True).prefetch(tf.data.AUTOTUNE)

# Validation dataset (no shuffle needed)
val_dataset = tf.data.Dataset.from_tensor_slices((X_val, y_val))
val_dataset = val_dataset.batch(config['BATCH_SIZE'], drop_remainder=True).prefetch(tf.data.AUTOTUNE)

In [24]:
idx = np.random.randint(0,63)
context,target = next(iter(val_dataset))
print(f"Input:{context[idx].numpy().decode('utf-8')}")
print(f"Target:{target[idx].numpy().decode('utf-8')}")

Input:Tom didn't come home last night. I hope he's OK.
Target:Anoche Tom no volvió a casa, espero que esté bien.


In [28]:
def tf_lower_and_split_punct_w_special_tokens(text):
  text = tf.strings.lower(text)
  text = tf.strings.regex_replace(text, '[^ a-z.?!,¿]', '')
  text = tf.strings.regex_replace(text, '[.?!,¿]', r' \0 ')
  text = tf.strings.strip(text)

  text = tf.strings.join(['[START]', text, '[END]'], separator=' ')
  return text

def tf_lower_and_split_punct(text):
  text = tf.strings.lower(text)
  text = tf.strings.regex_replace(text, '[^ a-z.?!,¿]', '')
  text = tf.strings.regex_replace(text, '[.?!,¿]', r' \0 ')
  text = tf.strings.strip(text)

  return text

In [29]:
print(target[idx].numpy().decode())
print(tf_lower_and_split_punct(target[idx]).numpy().decode())
print(tf_lower_and_split_punct_w_special_tokens(target[idx]).numpy().decode())

Anoche Tom no volvió a casa, espero que esté bien.
anoche tom no volvi a casa ,  espero que est bien .
[START] anoche tom no volvi a casa ,  espero que est bien . [END]


In [30]:
context_text_processor = tf.keras.layers.TextVectorization(
    standardize=tf_lower_and_split_punct,
    max_tokens=config['max_vocab_size'],
    output_sequence_length = config['max_length'],
    ragged=False)

target_text_processor = tf.keras.layers.TextVectorization(
    standardize=tf_lower_and_split_punct_w_special_tokens,
    max_tokens=config['max_vocab_size'],
    output_sequence_length = config['max_length'] + 1,
    ragged=False)

In [31]:
context_text_processor.adapt(train_dataset.map(lambda context, target: context))

print(context_text_processor.get_vocabulary()[:10])


target_text_processor.adapt(train_dataset.map(lambda context, target: target))
target_text_processor.get_vocabulary()[:10]


['', '[UNK]', '.', 'i', 'the', 'to', 'you', 'tom', '?', 'a']


['', '[UNK]', '[START]', '[END]', '.', 'de', 'no', 'tom', 'a', 'que']

In [32]:
def process_text(context, target):
  context = context_text_processor(context)
  target = target_text_processor(target)
  targ_in = target[:,:-1]
  targ_out = target[:,1:]
  return (context, targ_in), targ_out


train_dataset = train_dataset.map(process_text, num_parallel_calls=tf.data.AUTOTUNE)
val_dataset = val_dataset.map(process_text, num_parallel_calls = tf.data.AUTOTUNE)

In [33]:
for (ex_context_tok, ex_tar_in), ex_tar_out in val_dataset.take(1):
  print(ex_context_tok[0, :10].numpy()) 
  print(ex_context_tok.shape)
  print(ex_tar_in[0, :10].numpy()) 
  print(ex_tar_out[0, :10].numpy())
  print(ex_tar_out.shape)

[  18   10    4  297 4406   19    3  212  179    2]
(64, 50)
[   2   46 4227   14 1118   37    9  602  145    4]
[  46 4227   14 1118   37    9  602  145    4    3]
(64, 50)
