# **Setting up the environment**

In [1]:
import warnings
# Silence every warning category for the rest of the session so that cell
# outputs stay uncluttered.
# NOTE(review): this blanket filter also hides warnings that may matter
# (e.g. NumPy RuntimeWarnings) — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import tensorflow as tf

In [3]:
from pathlib import Path

# Download and unpack the Spanish–English sentence-pair archive into the local
# "datasets" cache directory.
url = "https://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip"
path = tf.keras.utils.get_file("spa-eng.zip", origin=url, cache_dir="datasets",
                               extract=True)
# Decode explicitly as UTF-8: the corpus contains accented Spanish characters,
# and relying on the platform default encoding (e.g. cp1252 on Windows) would
# garble them or raise UnicodeDecodeError.
text = (Path(path).with_name("spa-eng") / "spa.txt").read_text(encoding="utf-8")

Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip


In [4]:
np.random.seed(42)  # fixed seed so the shuffle below is repeatable

# Drop the inverted Spanish punctuation marks in a single pass.
text = text.translate(str.maketrans("", "", "¡¿"))
# Each line holds one tab-separated sentence pair.
pairs = [line.split("\t") for line in text.splitlines()]
np.random.shuffle(pairs)
# Unzip the shuffled pairs into two parallel tuples of sentences.
sentences_en, sentences_es = zip(*pairs)

In [5]:
# Show the first three aligned sentence pairs as a sanity check.
for i in range(3):
    source_sentence, target_sentence = sentences_en[i], sentences_es[i]
    print(source_sentence, "=>", target_sentence)

How boring! => Qué aburrimiento!
I love sports. => Adoro el deporte.
Would you like to swap jobs? => Te gustaría que intercambiemos los trabajos?


In [6]:
vocab_size = 1000  # maximum vocabulary size per language
max_length = 50    # every vectorized sentence has exactly this many token ids

# One vectorization layer per language, both configured identically.
text_vec_layer_en = tf.keras.layers.TextVectorization(
    max_tokens=vocab_size, output_sequence_length=max_length)
text_vec_layer_es = tf.keras.layers.TextVectorization(
    max_tokens=vocab_size, output_sequence_length=max_length)

# Build the vocabularies. The Spanish sentences are wrapped in the
# start/end markers so that both markers become part of the vocabulary.
text_vec_layer_en.adapt(sentences_en)
text_vec_layer_es.adapt([f"startofseq {s} endofseq" for s in sentences_es])

In [7]:
# Peek at the first ten entries of the English vocabulary.
text_vec_layer_en.get_vocabulary()[:10]

['', '[UNK]', 'the', 'i', 'to', 'you', 'tom', 'a', 'is', 'he']

In [8]:
# Peek at the first ten entries of the Spanish vocabulary (the start/end
# markers added before adapt() should show up here).
text_vec_layer_es.get_vocabulary()[:10]

['', '[UNK]', 'startofseq', 'endofseq', 'de', 'que', 'a', 'no', 'tom', 'la']

In [9]:
# Number of (already shuffled) sentence pairs used for training; every pair
# after this index goes to the validation set. Kept in one place so the six
# slices below cannot drift apart.
train_size = 100_000

# Encoder inputs: raw English sentences.
X_train = tf.constant(sentences_en[:train_size])
X_valid = tf.constant(sentences_en[train_size:])
# Decoder inputs: the Spanish sentence prefixed with the start marker.
X_train_dec = tf.constant([f"startofseq {s}" for s in sentences_es[:train_size]])
X_valid_dec = tf.constant([f"startofseq {s}" for s in sentences_es[train_size:]])
# Targets: the same Spanish sentence followed by the end marker, as token ids,
# i.e. the decoder input shifted by one token.
Y_train = text_vec_layer_es([f"{s} endofseq" for s in sentences_es[:train_size]])
Y_valid = text_vec_layer_es([f"{s} endofseq" for s in sentences_es[train_size:]])

In [10]:
tf.random.set_seed(42)  # fix TensorFlow's global random seed
# Both model inputs take raw strings: shape=[] means each sample is a single
# scalar string (one sentence) with no dimensions besides the batch axis.
encoder_inputs = tf.keras.layers.Input(shape=[], dtype=tf.string)
decoder_inputs = tf.keras.layers.Input(shape=[], dtype=tf.string)

In [11]:
embed_size = 128  # dimensionality of each token embedding
# Convert the raw input strings to fixed-length token-id sequences using the
# vectorization layers adapted earlier.
encoder_input_ids = text_vec_layer_en(encoder_inputs)
decoder_input_ids = text_vec_layer_es(decoder_inputs)
# One embedding table per language. mask_zero=True marks token id 0 as
# padding so that downstream layers can skip it.
encoder_embedding_layer = tf.keras.layers.Embedding(vocab_size, embed_size,
                                                    mask_zero=True)
decoder_embedding_layer = tf.keras.layers.Embedding(vocab_size, embed_size,
                                                    mask_zero=True)
encoder_embeddings = encoder_embedding_layer(encoder_input_ids)
decoder_embeddings = decoder_embedding_layer(decoder_input_ids)

# **Bidirectional RNNs - Sequence to Sequence Model**

In [12]:
tf.random.set_seed(42)  # extra code – ensures reproducibility on CPU
# Bidirectional LSTM encoder with 256 units per direction. return_state=True
# makes the layer return its final states in addition to its output; those
# states are used to initialise the decoder.
encoder = tf.keras.layers.Bidirectional(
    tf.keras.layers.LSTM(256, return_state=True))

In [13]:
# The encoder call returns its output followed by the state tensors. The
# slices below pair states 0 & 2 and states 1 & 3 and concatenate each pair
# along the last axis, so each resulting state has 2 x 256 = 512 units,
# matching the 512-unit decoder LSTM.
encoder_outputs, *encoder_state = encoder(encoder_embeddings)
encoder_state = [tf.concat(encoder_state[::2], axis=-1),  # short-term (0 & 2)
                 tf.concat(encoder_state[1::2], axis=-1)]  # long-term (1 & 3)

In [14]:
# Decoder LSTM, initialised with the concatenated encoder states;
# return_sequences=True yields one output per decoder time step.
decoder = tf.keras.layers.LSTM(512, return_sequences=True)
decoder_outputs = decoder(decoder_embeddings, initial_state=encoder_state)
# Softmax over the Spanish vocabulary at every time step.
output_layer = tf.keras.layers.Dense(vocab_size, activation="softmax")
Y_proba = output_layer(decoder_outputs)
model = tf.keras.Model(inputs=[encoder_inputs, decoder_inputs],
                       outputs=[Y_proba])
# The targets are integer token ids, hence the *sparse* categorical loss.
model.compile(loss="sparse_categorical_crossentropy", optimizer="nadam",
              metrics=["accuracy"])
# During training the decoder is fed the target sentence shifted by one token
# (X_*_dec starts with "startofseq", Y_* ends with "endofseq").
model.fit((X_train, X_train_dec), Y_train, epochs=10,
          validation_data=((X_valid, X_valid_dec), Y_valid))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7c579dd45930>

# **Greedy Decoding**

In [15]:
def translate(sentence_en):
    """Translate an English sentence to Spanish with greedy decoding.

    The translation is built one word at a time: at each step the model is
    run on the translation produced so far, and the most probable next word
    is appended.

    Args:
        sentence_en: the English sentence to translate (a plain string).

    Returns:
        The Spanish translation as a string. Decoding stops when the model
        predicts "endofseq" or after `max_length` words.
    """
    # Both of these are loop-invariant, so compute them once up front instead
    # of once per generated word.
    vocabulary = text_vec_layer_es.get_vocabulary()
    X = np.array([sentence_en])  # encoder input

    translation = ""
    for word_idx in range(max_length):
        X_dec = np.array(["startofseq " + translation])  # decoder input
        # verbose=0: avoid printing one progress bar per generated word.
        y_proba = model.predict((X, X_dec), verbose=0)[0, word_idx]  # last token's probas
        predicted_word = vocabulary[np.argmax(y_proba)]
        if predicted_word == "endofseq":
            break
        translation += " " + predicted_word
    return translation.strip()

In [16]:
# Quick check of greedy decoding on a short sentence.
translate("I like soccer")



'me gusta el fútbol'

In [17]:
# A longer sentence, to see how greedy decoding copes with more context.
translate("I like soccer and also going to the beach")



'me gusta [UNK] ella antes de ir a la playa'

# **Beam Search**

In [18]:
def beam_search(sentence_en, beam_width, verbose=False):
    """Translate an English sentence to Spanish using beam search.

    At every step each unfinished candidate is extended with every vocabulary
    word, and only the `beam_width` candidates with the highest cumulative
    log-probability are kept.

    Args:
        sentence_en: the English sentence to translate (a plain string).
        beam_width: number of candidate translations kept after each step.
        verbose: if True, print the surviving candidates after each step.

    Returns:
        The highest-scoring translation with the "endofseq" marker removed.
        If the `max_length` limit is reached before every candidate has
        finished, the best candidate found so far is returned (previously
        the function fell off the end and returned None in that case).
    """
    # Loop-invariant: fetch the vocabulary once rather than once per word of
    # every beam at every step.
    vocabulary = text_vec_layer_es.get_vocabulary()
    X = np.array([sentence_en])  # encoder input
    X_dec = np.array(["startofseq"])  # decoder input
    # verbose=0: avoid printing one progress bar per model call.
    y_proba = model.predict((X, X_dec), verbose=0)[0, 0]  # first token's probas
    top_k = tf.math.top_k(y_proba, k=beam_width)
    top_translations = [  # list of best (log_proba, translation)
        (np.log(word_proba), vocabulary[word_id])
        for word_proba, word_id in zip(top_k.values, top_k.indices)
    ]

    # extra code – displays the top first words in verbose mode
    if verbose:
        print("Top first words:", top_translations)

    for idx in range(1, max_length):
        candidates = []
        for log_proba, translation in top_translations:
            if translation.endswith("endofseq"):
                candidates.append((log_proba, translation))
                continue  # translation is finished, so don't try to extend it
            X_dec = np.array(["startofseq " + translation])  # decoder input
            y_proba = model.predict((X, X_dec), verbose=0)[0, idx]  # last token's proba
            word_log_probas = np.log(y_proba)
            candidates.extend(
                (log_proba + word_log_proba, f"{translation} {word}")
                for word, word_log_proba in zip(vocabulary, word_log_probas))
        top_translations = sorted(candidates, reverse=True)[:beam_width]

        # extra code – displays the top translation so far in verbose mode
        if verbose:
            print("Top translations so far:", top_translations)

        if all(tr.endswith("endofseq") for _, tr in top_translations):
            break  # every surviving candidate is complete

    return top_translations[0][1].replace("endofseq", "").strip()

In [19]:
# Greedy-decoding baseline for the sentence used in the beam-search demo.
sentence_en = "I like soccer and also going to the beach"
translate(sentence_en)



'me gusta [UNK] ella antes de ir a la playa'

In [20]:
# Same sentence with beam search; verbose=True prints the surviving
# candidates after every step.
beam_search(sentence_en, beam_width=3, verbose=True)

Top first words: [(-0.0048767435, 'me'), (-5.995707, 'a'), (-7.161818, 'se')]
Top translations so far: [(-0.021505108, 'me gusta'), (-4.168967, 'me gustan'), (-6.6700883, 'a mí')]
Top translations so far: [(-1.4461229, 'me gusta [UNK]'), (-1.7645427, 'me gusta la'), (-1.8998783, 'me gusta que')]
Top translations so far: [(-2.4531894, 'me gusta la estación'), (-2.9508576, 'me gusta la chica'), (-2.967947, 'me gusta que [UNK]')]
Top translations so far: [(-2.8418884, 'me gusta la estación y'), (-3.7337677, 'me gusta la chica que'), (-3.9009368, 'me gusta que [UNK] [UNK]')]
Top translations so far: [(-3.4586139, 'me gusta la estación y me'), (-4.0285234, 'me gusta que [UNK] [UNK] a'), (-4.175418, 'me gusta la estación y a')]
Top translations so far: [(-3.5529494, 'me gusta la estación y me gusta'), (-4.070143, 'me gusta que [UNK] [UNK] a la'), (-5.2871118, 'me gusta la estación y a mi')]
Top translations so far: [(-4.13051, 'me gusta que [UNK] [UNK] a la playa'), (-4.464407, 'me gusta la 

'me gusta que [UNK] [UNK] a la playa'

# **Sampling (Nucleus Sampling, Temperature Sampling, Top-k Sampling)**

In [21]:
import random

# Utility to sample a token id from a probability distribution
def sample_token_from_probs(probs, temperature=1.0):
    """Sample an index from `probs`, optionally reshaped by a temperature.

    Args:
        probs: 1-D array-like of non-negative probabilities (for example a
            softmax output).
        temperature: positive scaling factor. Values below 1 sharpen the
            distribution, values above 1 flatten it, and 1.0 leaves it
            unchanged.

    Returns:
        The sampled index, an integer in `range(len(probs))`.

    Raises:
        ValueError: if `temperature` is not strictly positive.
    """
    if temperature <= 0:
        raise ValueError(f"temperature must be > 0, got {temperature!r}")
    # Work in float64: float32 softmax outputs lose too much precision once
    # they are rescaled.
    probs = np.asarray(probs, dtype=np.float64)
    if temperature != 1.0:
        scaled_logits = np.log(probs + 1e-9) / temperature
        # Subtract the maximum before exponentiating so that low temperatures
        # cannot underflow every term to 0 (which used to give 0/0 = NaN).
        scaled_logits -= np.max(scaled_logits)
        probs = np.exp(scaled_logits)
    # Always renormalise so np.random.choice gets probabilities summing to 1.
    probs = probs / np.sum(probs)
    return np.random.choice(len(probs), p=probs)

# Top-k Sampling
def top_k_sampling(sentence_en, k=10, temperature=1.0):
    """Translate by sampling each word from the k most probable candidates.

    Args:
        sentence_en: the English sentence to translate (a plain string).
        k: number of highest-probability words kept at each step.
        temperature: temperature applied to the renormalised top-k
            distribution before sampling. The default of 1.0 samples from
            the top-k probabilities as they are. (This argument used to be
            accepted but silently ignored.)

    Returns:
        The sampled Spanish translation as a string. Decoding stops when
        "endofseq" is sampled or after `max_length` words.
    """
    # Loop-invariant values, computed once instead of once per word.
    vocabulary = text_vec_layer_es.get_vocabulary()
    X = np.array([sentence_en])

    translation = ""
    for idx in range(max_length):
        X_dec = np.array(["startofseq " + translation])
        # verbose=0: avoid printing one progress bar per generated word.
        y_proba = model.predict((X, X_dec), verbose=0)[0, idx]

        top_k_indices = np.argsort(y_proba)[-k:]
        top_k_probs = y_proba[top_k_indices]
        top_k_probs = top_k_probs / np.sum(top_k_probs)  # Normalize
        # The helper returns a position within the top-k list; map it back
        # to the corresponding vocabulary id.
        sampled_index = top_k_indices[
            sample_token_from_probs(top_k_probs, temperature)]

        word = vocabulary[sampled_index]
        if word == "endofseq":
            break
        translation += " " + word
    return translation.strip()

# Nucleus (Top-p) Sampling
def nucleus_sampling(sentence_en, p=0.9, temperature=1.0):
    """Translate by sampling each word from the smallest high-mass word set.

    At each step the words are sorted by decreasing probability and the
    shortest prefix whose cumulative probability exceeds `p` is kept; the
    next word is sampled from that prefix.

    Args:
        sentence_en: the English sentence to translate (a plain string).
        p: cumulative-probability threshold. If no prefix exceeds it (for
            example `p=1.0`), the whole vocabulary is kept instead of
            raising IndexError as before.
        temperature: temperature applied to the renormalised nucleus
            distribution before sampling. The default of 1.0 samples from
            the nucleus probabilities as they are. (This argument used to be
            accepted but silently ignored.)

    Returns:
        The sampled Spanish translation as a string. Decoding stops when
        "endofseq" is sampled or after `max_length` words.
    """
    # Loop-invariant values, computed once instead of once per word.
    vocabulary = text_vec_layer_es.get_vocabulary()
    X = np.array([sentence_en])

    translation = ""
    for idx in range(max_length):
        X_dec = np.array(["startofseq " + translation])
        # verbose=0: avoid printing one progress bar per generated word.
        y_proba = model.predict((X, X_dec), verbose=0)[0, idx]

        sorted_indices = np.argsort(y_proba)[::-1]
        cumulative_probs = np.cumsum(y_proba[sorted_indices])

        # Position of the first word at which the cumulative mass exceeds p.
        # Rounding can leave the total just below 1, so the set may be empty.
        over_threshold = np.flatnonzero(cumulative_probs > p)
        if over_threshold.size:
            cutoff = over_threshold[0] + 1
        else:
            cutoff = len(sorted_indices)
        top_p_indices = sorted_indices[:cutoff]
        top_p_probs = y_proba[top_p_indices]
        top_p_probs = top_p_probs / np.sum(top_p_probs)

        # The helper returns a position within the nucleus; map it back to
        # the corresponding vocabulary id.
        sampled_index = top_p_indices[
            sample_token_from_probs(top_p_probs, temperature)]
        word = vocabulary[sampled_index]
        if word == "endofseq":
            break
        translation += " " + word
    return translation.strip()

# Temperature Sampling
def temperature_sampling(sentence_en, temperature=1.0):
    """Translate by sampling each word from the temperature-scaled distribution.

    Args:
        sentence_en: the English sentence to translate (a plain string).
        temperature: passed to `sample_token_from_probs`; lower values make
            sampling closer to greedy decoding, higher values make it more
            random.

    Returns:
        The sampled Spanish translation as a string. Decoding stops when
        "endofseq" is sampled or after `max_length` words.
    """
    # Loop-invariant values, computed once instead of once per word.
    vocabulary = text_vec_layer_es.get_vocabulary()
    X = np.array([sentence_en])

    translation = ""
    for idx in range(max_length):
        X_dec = np.array(["startofseq " + translation])
        # verbose=0: avoid printing one progress bar per generated word.
        y_proba = model.predict((X, X_dec), verbose=0)[0, idx]

        sampled_index = sample_token_from_probs(y_proba, temperature)
        word = vocabulary[sampled_index]
        if word == "endofseq":
            break
        translation += " " + word
    return translation.strip()


In [22]:
# Compare the three sampling strategies on the same sentence.
sentence = "I like soccer and also going to the beach"

top_k_result = top_k_sampling(sentence, k=10)
print("Top-k Sampling:", top_k_result)

nucleus_result = nucleus_sampling(sentence, p=0.9)
print("Nucleus Sampling:", nucleus_result)

temperature_result = temperature_sampling(sentence, temperature=0.7)
print("Temperature Sampling (0.7):", temperature_result)


Top-k Sampling: me gusta como a la chica a la playa
Nucleus Sampling: me gusta como la estación antes de ir a la playa
Temperature Sampling (0.7): me gusta la chica cuando estoy al fútbol


In [23]:
def temperature_sampling_range(sentence_en, temperatures=(0.4, 0.5, 0.6, 0.7, 0.8)):
    """Translate one sentence once per temperature and collect the results.

    Args:
        sentence_en: the English sentence to translate (a plain string).
        temperatures: iterable of sampling temperatures to try. The default
            is an immutable tuple rather than a shared mutable list.

    Returns:
        A dict mapping each temperature to the translation sampled at that
        temperature, in the order the temperatures were given.
    """
    # Delegate to temperature_sampling instead of duplicating its decoding
    # loop, so both code paths stay in agreement.
    return {temp: temperature_sampling(sentence_en, temperature=temp)
            for temp in temperatures}


In [24]:
# Sample one translation per temperature and list them side by side.
translations = temperature_sampling_range(
    sentence, temperatures=[0.4, 0.5, 0.6, 0.7, 0.8])
for temp in translations:
    result = translations[temp]
    print(f"Temperature {temp}: {result}")

Temperature 0.4: me gusta [UNK] a ella y a la playa
Temperature 0.5: me gusta cuando [UNK] a la playa
Temperature 0.6: me gusta que ella [UNK] a la playa a la playa
Temperature 0.7: me gusta [UNK] antes de ir a la playa
Temperature 0.8: me gusta que me gusta ella y a la playa
