In [1]:
import os
# Opt in to the legacy Keras implementation behind tf.keras.
# NOTE(review): this presumably has to be set before TensorFlow is imported to take effect — confirm.
os.environ['TF_USE_LEGACY_KERAS'] = '1'

In [2]:
# NOTE(review): the sacrebleu version is not pinned, so a re-run may install a different release.
!pip install sacrebleu --quiet


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [17]:
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pathlib
import tensorflow_text as tf_text
from tqdm import tqdm
import sacrebleu
import pickle

In [4]:
# Distribution strategy; the model is later built and compiled inside strategy.scope().
strategy = tf.distribute.MirroredStrategy()

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1')


I0000 00:00:1766161415.104204      55 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 13942 MB memory:  -> device: 0, name: Tesla T4, pci bus id: 0000:00:04.0, compute capability: 7.5
I0000 00:00:1766161415.104901      55 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 13942 MB memory:  -> device: 1, name: Tesla T4, pci bus id: 0000:00:05.0, compute capability: 7.5


In [5]:
# Report how many replicas the strategy keeps in sync.
print(f'Number of devices: {strategy.num_replicas_in_sync}')

Number of devices: 2


In [6]:
# Hyperparameters shared by the cells below.
config = {
    "learning_rate": 1e-4,
    "batch_size": 256,       # per-replica batch size (scaled by the replica count below)
    "epochs": 20,
    "max_vocab_size":5000,   # passed as max_tokens to both TextVectorization layers
    "max_length":50          # sequence length used by the vectorizers and the model inputs
}

# Global batch size = per-replica batch size x number of replicas.
GLOBAL_BATCH = config['batch_size'] * strategy.num_replicas_in_sync
# NOTE(review): LEARNING_RATE and EPOCHS are not referenced by the compile/fit cells visible in
# this notebook (those use optimizer='adam' and epochs=30) — confirm which values are intended.
LEARNING_RATE = config['learning_rate']
EPOCHS = config['epochs']

In [7]:
# Download and unpack the English-Spanish sentence-pair archive.
# The archive is fetched over HTTPS (instead of plain HTTP) so it cannot be
# altered in transit before being extracted and parsed.
path_to_zip = tf.keras.utils.get_file(
    'spa-eng.zip', origin='https://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip',
    extract=True)

# Tab-separated sentence pairs inside the extracted archive.
path_to_file = pathlib.Path(path_to_zip).parent/'spa-eng/spa.txt'

Downloading data from http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip


In [8]:
def load_data(path):
  """Read a tab-separated parallel corpus into two aligned NumPy arrays.

  Every non-blank line must contain at least two tab-separated fields. Blank
  lines are skipped and any fields after the second one are ignored.

  Args:
    path: A `pathlib.Path` (or any object with a compatible `read_text`)
      pointing at the UTF-8 encoded corpus file.

  Returns:
    A tuple `(first, second)` of equally long 1-D `np.ndarray`s, where
    `first[i]` and `second[i]` are the first and second field of the i-th
    non-blank line.

  Raises:
    ValueError: If a non-blank line has fewer than two tab-separated fields.
  """
  text = path.read_text(encoding='utf-8')

  first_column = []
  second_column = []
  for line_number, line in enumerate(text.splitlines(), start=1):
    # A stray blank line (for example at the end of the file) carries no pair.
    if not line.strip():
      continue
    fields = line.split('\t')
    if len(fields) < 2:
      raise ValueError(
          f'Line {line_number} of {path} is not a tab-separated pair: {line!r}')
    # Only the first two columns form the sentence pair; extras are ignored.
    first_column.append(fields[0])
    second_column.append(fields[1])

  return np.array(first_column), np.array(second_column)

In [9]:
# load_data returns the first file column, then the second. Judging by the samples printed
# below, these are English and Spanish, so English is the context (source) and Spanish the target.
context_raw,target_raw = load_data(path_to_file)
print(context_raw[-1])

If you want to sound like a native speaker, you must be willing to practice saying the same sentence over and over in the same way that banjo players practice the same phrase over and over until they can play it correctly and at the desired tempo.


In [10]:
# Target-side counterpart of the context sentence printed in the previous cell.
print(target_raw[-1])

Si quieres sonar como un hablante nativo, debes estar dispuesto a practicar diciendo la misma frase una y otra vez de la misma manera en que un músico de banjo practica el mismo fraseo una y otra vez hasta que lo puedan tocar correctamente y en el tiempo esperado.


In [11]:
# Positional 90/10 split: the first 90% of the pairs train, the last 10% validate.
# NOTE(review): the pairs are not shuffled before splitting. If the file is ordered (the last
# pair printed above is far longer than a typical sentence), validation is drawn from a
# different distribution than training — confirm this is intended.
split_idx = int(0.9 * len(target_raw))

X_train = context_raw[:split_idx]
y_train = target_raw[:split_idx]

X_val = context_raw[split_idx:]
y_val = target_raw[split_idx:]

# Shuffle buffer as large as the training set.
BUFFER_SIZE = len(X_train)

train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
train_dataset = train_dataset.shuffle(BUFFER_SIZE).batch(GLOBAL_BATCH, drop_remainder=True).prefetch(tf.data.AUTOTUNE)

# Validation dataset (no shuffle needed)
# NOTE(review): drop_remainder=True also discards the final partial validation batch, so those
# pairs are excluded from val_loss and from every later pass over val_dataset.
val_dataset = tf.data.Dataset.from_tensor_slices((X_val, y_val))
val_dataset = val_dataset.batch(GLOBAL_BATCH, drop_remainder=True).prefetch(tf.data.AUTOTUNE)

In [12]:
# Number of batches in each split.
TRAIN_STEPS, VAL_STEPS = (
    tf.data.experimental.cardinality(split).numpy()
    for split in (train_dataset, val_dataset)
)

print(TRAIN_STEPS, VAL_STEPS)

209 23


In [13]:
# Print one random raw (not yet tokenized) pair from the first validation batch.
# NOTE(review): the index is drawn from [0, 63) although a batch holds GLOBAL_BATCH rows —
# presumably a leftover from a smaller batch size.
idx = np.random.randint(0,63)
context,target = next(iter(val_dataset))
print(f"Input:{context[idx].numpy().decode('utf-8')}")
print(f"Target:{target[idx].numpy().decode('utf-8')}")

Input:Tom and Mary usually speak French to each other.
Target:Tom y María normalmente hablan francés entre ellos.


In [14]:
def tf_lower_and_split_punct_w_special_tokens(text):
  """Standardize text and wrap it in `[START]` / `[END]` markers.

  The cleaning itself is delegated to `tf_lower_and_split_punct`, so marked and
  unmarked text are always normalized by exactly the same rules instead of by
  two copies of the pipeline that could drift apart.

  Args:
    text: A string tensor holding one or more sentences.

  Returns:
    A string tensor of the same shape: the standardized text prefixed with
    `[START] ` and suffixed with ` [END]`.
  """
  text = tf_lower_and_split_punct(text)
  return tf.strings.join(['[START]', text, '[END]'], separator=' ')

def tf_lower_and_split_punct(text):
  """Lower-case, strip to a small character set, and space out punctuation.

  Args:
    text: A string tensor holding one or more sentences.

  Returns:
    A string tensor of the same shape containing only spaces, `a`-`z` and the
    punctuation marks `. ? ! , ¿`, with each punctuation mark surrounded by
    spaces and leading/trailing whitespace removed.
  """
  # NFKD splits accented characters into a base letter plus a combining mark;
  # the mark is then dropped by the character filter below.
  lowered = tf.strings.lower(tf_text.normalize_utf8(text, 'NFKD'))
  # Keep only space, a-z and the selected punctuation.
  filtered = tf.strings.regex_replace(lowered, '[^ a-z.?!,¿]', '')
  # Surround punctuation with spaces so each mark becomes its own token.
  spaced = tf.strings.regex_replace(filtered, '[.?!,¿]', r' \0 ')
  return tf.strings.strip(spaced)

In [15]:
# Show the sampled target sentence raw, standardized, and standardized with [START]/[END] markers.
# Relies on `target` and `idx` from the sampling cell above.
print(target[idx].numpy().decode())
print(tf_lower_and_split_punct(target[idx]).numpy().decode())
print(tf_lower_and_split_punct_w_special_tokens(target[idx]).numpy().decode())

Tom y María normalmente hablan francés entre ellos.
tom y maria normalmente hablan frances entre ellos .
[START] tom y maria normalmente hablan frances entre ellos . [END]


In [18]:
# Load the precomputed vocabularies that are handed to the TextVectorization layers below.
# NOTE(review): pickle.load can execute arbitrary code — only load these files from a trusted source.
# NOTE(review): the paths are hard-coded to a Kaggle input dataset.
with open('/kaggle/input/vocab/tensorflow2/default/1/context_vocab.pkl', 'rb') as f:
    context_vocab = pickle.load(f)

with open('/kaggle/input/vocab/tensorflow2/default/1/target_vocab.pkl', 'rb') as f:
    target_vocab = pickle.load(f)

In [19]:
# Context (source) vectorizer: standardized text -> fixed-length id sequences of
# config['max_length'] entries, using the preloaded context vocabulary.
context_text_processor = tf.keras.layers.TextVectorization(
    standardize = tf_lower_and_split_punct,
    max_tokens = config['max_vocab_size'],
    output_sequence_length = config['max_length'],
    vocabulary = context_vocab,
    ragged = False)

# Target vectorizer: its standardizer adds [START]/[END]. The output is one position longer
# than max_length so process_text can slice it into a decoder input and a shifted label
# sequence of max_length positions each.
target_text_processor = tf.keras.layers.TextVectorization(
    standardize = tf_lower_and_split_punct_w_special_tokens,
    max_tokens = config['max_vocab_size'],
    output_sequence_length = config['max_length'] + 1,
    vocabulary = target_vocab,
    ragged = False)

In [20]:
# The vocabularies are supplied at construction time (loaded from the pickles above),
# so the adapt() calls below are left disabled.
# context_text_processor.adapt(train_dataset.map(lambda context, target: context))

# First ten entries of the context vocabulary.
print(context_text_processor.get_vocabulary()[:10])


# target_text_processor.adapt(train_dataset.map(lambda context, target: target))
# First ten entries of the target vocabulary (shown as the cell's output value).
target_text_processor.get_vocabulary()[:10]

['', '[UNK]', np.str_('.'), np.str_('i'), np.str_('the'), np.str_('to'), np.str_('you'), np.str_('tom'), np.str_('?'), np.str_('a')]


['',
 '[UNK]',
 np.str_('[START]'),
 np.str_('[END]'),
 np.str_('.'),
 np.str_('que'),
 np.str_('el'),
 np.str_('de'),
 np.str_('no'),
 np.str_('tom')]

In [21]:
def process_text(context, target):
  """Tokenize a batch of raw sentence pairs into model inputs and labels.

  Args:
    context: String tensor of source sentences.
    target: String tensor of target sentences.

  Returns:
    A nested tuple `((context_ids, decoder_in), labels)` where `decoder_in` is
    the tokenized target without its last position and `labels` is the
    tokenized target without its first position.
  """
  context_ids = context_text_processor(context)
  target_ids = target_text_processor(target)
  # The labels are the decoder input shifted one step to the left.
  decoder_in, labels = target_ids[:, :-1], target_ids[:, 1:]
  return (context_ids, decoder_in), labels


# Tokenize both splits inside the input pipeline.
# NOTE(review): this rebinds train_dataset/val_dataset, so re-running the cell would apply
# process_text to already-tokenized batches; re-create the datasets first.
train_dataset = train_dataset.map(process_text, num_parallel_calls=tf.data.AUTOTUNE)
val_dataset = val_dataset.map(process_text, num_parallel_calls = tf.data.AUTOTUNE)

In [22]:
# Peek at the first tokenized validation batch.
batch = next(iter(val_dataset))
# NOTE(review): val_dataset is a plain (non-distributed) dataset at this point, so
# experimental_local_results presumably just wraps the batch in a 1-tuple — confirm.
replica_batch = strategy.experimental_local_results(batch)[0]
(ex_context_tok, ex_tar_in), ex_tar_out = replica_batch
print(ex_context_tok[0, :10].numpy())  # first context ids of row 0
print(ex_context_tok.shape)
print(ex_tar_in[0, :10].numpy())  # decoder input of row 0
print(ex_tar_out[0, :10].numpy())  # labels of row 0: the decoder input shifted left by one
print(ex_tar_out.shape)

[  18   10    4  297 4408   19    3  212  179    2]
(512, 50)
[   2   42 4238   14 1118   36    5  592  137    4]
[  42 4238   14 1118   36    5  592  137    4    3]
(512, 50)


In [23]:
def Build_Seq2Seq(
    max_length=157,
    vocab_size_en=10000,
    vocab_size_es=10000,
    embedding_dim=256,
    units=256,
    dropout_rate=0.2,
    num_heads=8
):
    """Build a bidirectional-GRU encoder / GRU decoder model with cross-attention.

    The encoder embeds the source ids and runs them through a bidirectional GRU.
    Its final forward and backward states are concatenated and used as the
    initial state of the decoder GRU. The decoder outputs then attend over the
    encoder outputs, and the concatenation of decoder outputs and attention
    output is projected to a softmax over the target vocabulary.

    Layers are created in a fixed order so that auto-generated layer names
    (and therefore saved weights) stay stable.

    Args:
        max_length: Fixed length of both the encoder and the decoder input.
        vocab_size_en: Size of the source (encoder) vocabulary.
        vocab_size_es: Size of the target (decoder) vocabulary; also the width
            of the output layer.
        embedding_dim: Dimension of both embedding layers.
        units: Units per encoder GRU direction; the decoder GRU uses
            `units * 2` so it can take the concatenated encoder state.
        dropout_rate: Dropout rate applied to both embedding outputs.
        num_heads: Number of heads of the cross-attention layer.

    Returns:
        A `tf.keras.Model` named "seq2seq_gru" taking
        `[encoder_input, decoder_input]` (int32 ids, id 0 treated as padding)
        and returning per-position softmax probabilities over the target
        vocabulary.
    """
    # ================= Encoder =================
    encoder_input = tf.keras.layers.Input(
        shape=(max_length,), dtype="int32", name="encoder_input"
    )

    # mask_zero=True marks id 0 as padding for the downstream layers.
    enc_emb = tf.keras.layers.Embedding(
        vocab_size_en, embedding_dim, mask_zero=True
    )(encoder_input)
    enc_emb = tf.keras.layers.Dropout(dropout_rate)(enc_emb)

    encoder = tf.keras.layers.Bidirectional(
        tf.keras.layers.GRU(
            units,
            return_sequences=True,
            return_state=True,
            name="encoder_gru"
        )
    )

    encoder_outputs, forward_h, backward_h = encoder(enc_emb)

    # Concatenate forward + backward hidden states
    encoder_state_h = tf.keras.layers.Concatenate(axis=-1)(
        [forward_h, backward_h]
    )  # shape = (batch, units*2)

    # ================= Decoder =================
    decoder_input = tf.keras.layers.Input(
        shape=(max_length,), dtype="int32", name="decoder_input"
    )

    dec_emb = tf.keras.layers.Embedding(
        vocab_size_es, embedding_dim, mask_zero=True
    )(decoder_input)
    dec_emb = tf.keras.layers.Dropout(dropout_rate)(dec_emb)

    decoder_outputs = tf.keras.layers.GRU(
        units * 2,                 # MUST match encoder_state_h
        return_sequences=True,
        name="decoder_gru"
    )(dec_emb, initial_state=[encoder_state_h])

    # ================= Cross Attention =================
    # Decoder positions (queries) attend over the encoder outputs (keys/values).
    attention_output = tf.keras.layers.MultiHeadAttention(
        num_heads=num_heads,
        key_dim=units * 2,
        name="cross_attention"
    )(
        query=decoder_outputs,
        value=encoder_outputs,
        key=encoder_outputs
    )

    # ================= Output =================
    decoder_combined = tf.keras.layers.Concatenate(axis=-1)(
        [decoder_outputs, attention_output]
    )

    # Softmax output: the model emits probabilities, not logits.
    outputs = tf.keras.layers.Dense(
        vocab_size_es, activation="softmax", name="output_dense"
    )(decoder_combined)

    model = tf.keras.Model(
        inputs=[encoder_input, decoder_input],
        outputs=outputs,
        name="seq2seq_gru"
    )

    return model


vocab_size_en = context_text_processor.vocabulary_size()
vocab_size_es = target_text_processor.vocabulary_size()
# The target language is Spanish; the old, mislabelled name is kept as an alias
# in case other cells still reference it.
vocab_size_fr = vocab_size_es
print(f"English vocab size: {vocab_size_en}")
print(f"Spanish vocab size: {vocab_size_es}")

# Build the model inside the strategy scope so its variables are created under the strategy.
with strategy.scope():
    model = Build_Seq2Seq(
        max_length=config['max_length'],
        vocab_size_en=vocab_size_en,
        vocab_size_es=vocab_size_es,
        embedding_dim=256,
        units=256
    )


English vocab size: 5000
French vocab size: 5000


In [24]:
# Layer-by-layer overview of the model built above.
model.summary()

Model: "seq2seq_gru"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 encoder_input (InputLayer)  [(None, 50)]                 0         []                            
                                                                                                  
 embedding (Embedding)       (None, 50, 256)              1280000   ['encoder_input[0][0]']       
                                                                                                  
 decoder_input (InputLayer)  [(None, 50)]                 0         []                            
                                                                                                  
 dropout (Dropout)           (None, 50, 256)              0         ['embedding[0][0]']           
                                                                                        

In [25]:
def masked_loss(y_true, y_pred):
    """Sparse categorical cross-entropy averaged over non-padding positions.

    Args:
        y_true: Integer label ids; id 0 marks padding.
        y_pred: Predicted class probabilities (not logits) with the vocabulary
            on the last axis.

    Returns:
        A scalar: the summed loss of all positions whose label is not 0,
        divided by the number of such positions.
    """
    # reduction='none' keeps one loss value per position so padding can be zeroed out.
    per_position = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=False,
        reduction='none'
    )(y_true, y_pred)

    keep = tf.cast(y_true != 0, per_position.dtype)
    masked_total = tf.reduce_sum(per_position * keep)

    return masked_total / tf.reduce_sum(keep)


def masked_acc(y_true, y_pred):
    """Token-level accuracy computed over non-padding positions only.

    Args:
        y_true: Integer label ids; id 0 marks padding.
        y_pred: Per-class scores with the vocabulary on the last axis.

    Returns:
        A scalar: the number of non-padding positions whose arg-max prediction
        equals the label, divided by the number of non-padding positions.
    """
    predicted_ids = tf.argmax(y_pred, axis=-1)
    predicted_ids = tf.cast(predicted_ids, y_true.dtype)

    mask = tf.cast(y_true != 0, tf.float32)

    # Count matches at real-token positions only. Without the mask, a padding
    # position predicted as id 0 is counted as correct while the denominator
    # excludes it, which inflates the metric (it can even exceed 1.0).
    match = tf.cast(y_true == predicted_ids, tf.float32) * mask

    return tf.reduce_sum(match) / tf.reduce_sum(mask)

In [26]:
# Compile inside the strategy scope, as was done when building the model.
# NOTE(review): optimizer='adam' uses the optimizer's default learning rate rather than
# LEARNING_RATE from the config cell — confirm which value is intended.
with strategy.scope():
    model.compile(optimizer='adam',
                  loss=masked_loss, 
                  metrics=[masked_acc])

In [27]:
# NOTE(review): epochs is hard-coded to 30 while the config cell defines EPOCHS = 20 —
# confirm which value is intended.
history = model.fit(
    train_dataset, 
    epochs=30,
    validation_data=val_dataset,
    callbacks=[
        # Stop after 5 epochs without val_loss improvement and restore the best weights.
        tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',
            patience=5,
            restore_best_weights=True
        ),
        # Multiply the learning rate by 0.1 after 3 epochs without val_loss improvement.
        tf.keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.1,          
            patience=3,           
            min_lr=1e-7,          # lower bound for the reduced learning rate
            verbose=1             
        )
    ]
)

Epoch 1/30
INFO:tensorflow:Collective all_reduce tensors: 19 all_reduces, num_devices = 2, group_size = 2, implementation = CommunicationImplementation.NCCL, num_packs = 1
INFO:tensorflow:Collective all_reduce IndexedSlices: 2 all_reduces, num_devices =2, group_size = 2, implementation = CommunicationImplementation.NCCL
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Collective all_reduce tensors: 19 all_reduces, num_devices = 2, group_size = 2, implementation = Commu

I0000 00:00:1766161493.279102     132 cuda_dnn.cc:529] Loaded cuDNN version 91002
I0000 00:00:1766161493.279121     129 cuda_dnn.cc:529] Loaded cuDNN version 91002
I0000 00:00:1766161495.383004     131 service.cc:152] XLA service 0x7c5ad38df660 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1766161495.383047     131 service.cc:160]   StreamExecutor device (0): Tesla T4, Compute Capability 7.5
I0000 00:00:1766161495.383056     131 service.cc:160]   StreamExecutor device (1): Tesla T4, Compute Capability 7.5
I0000 00:00:1766161495.772253     131 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 10: ReduceLROnPlateau reducing learning rate to 0.00010000000474974513.
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 14: ReduceLROnPlateau reducing learning rate to 1.0000000656873453e-05.
Epoch 15/30
Epoch 16/30


In [28]:
# Save the whole trained model to a single .keras file in the working directory.
model.save("BI-GRU-Cross-ATT.keras")

In [29]:
# Additionally save only the weights to an .h5 file.
model.save_weights("BI-GRU-Cross-ATT.h5")

In [30]:
# Reload the saved model for inference. compile=False skips restoring the compile state,
# presumably so the custom masked_loss/masked_acc functions need not be supplied — confirm.
loaded_model = tf.keras.models.load_model("/kaggle/working/BI-GRU-Cross-ATT.keras",compile=False)

In [31]:
def get_initial_state(model, context, target_text_processor):
    """Create the starting tensors for step-by-step decoding.

    Args:
        model: Unused here; accepted for a uniform call signature.
        context: Tokenized source ids, either a single sequence (rank 1) or a
            batch of sequences.
        target_text_processor: Vectorization layer whose vocabulary contains
            the `[START]` token.

    Returns:
        A tuple `(next_token, done, context)`: a `(batch, 1)` tensor filled
        with the `[START]` id, a `(batch, 1)` all-False bool tensor, and the
        (possibly batch-expanded) context.
    """
    # Promote a single un-batched sequence to a batch of one.
    if len(context.shape) == 1:
        context = tf.expand_dims(context, 0)

    start_id = target_text_processor.get_vocabulary().index('[START]')
    num_rows = tf.shape(context)[0]

    first_tokens = tf.fill([num_rows, 1], start_id)
    finished = tf.zeros([num_rows, 1], dtype=tf.bool)

    return first_tokens, finished, context


def get_next_token(model, context, next_token, done, state, target_text_processor, temperature=0.0):
    """Run one decoding step and append the chosen token to the decoded prefix.

    Args:
        model: Seq2seq model called as `model([context, decoder_input])` and
            returning per-position softmax probabilities over the target
            vocabulary.
        context: Batch of tokenized source ids.
        next_token: Unused; kept so existing callers keep working.
        done: `(batch, 1)` bool tensor marking rows that already produced `[END]`.
        state: `(batch, t)` int tensor with the tokens decoded so far, starting
            with `[START]`.
        target_text_processor: Vectorization layer whose vocabulary contains
            the `[END]` token.
        temperature: `0.0` selects the most probable token (greedy); any other
            value samples from the temperature-scaled distribution.

    Returns:
        A tuple `(next_token_id, done, new_state)`: the `(batch, 1)` token
        chosen at this step (0 for finished rows), the updated `done` flags,
        and `state` with the new token appended.
    """
    end_token = target_text_processor.get_vocabulary().index('[END]')
    max_length = config['max_length']
    prefix_len = tf.shape(state)[1]

    # The model has a fixed decoder length, so right-pad the prefix with zeros.
    padded_state = tf.pad(state, [[0, 0], [0, max_length - prefix_len]])[:, :max_length]
    # Call the model directly instead of model.predict(): predict() is built for
    # large datasets and adds substantial per-call overhead inside a decode loop.
    probs = model([context, padded_state], training=False)
    # Distribution for the next token = prediction at the last filled position.
    last_probs = probs[:, prefix_len - 1, :]

    if temperature == 0.0:
        next_token_id = tf.argmax(last_probs, axis=-1, output_type=tf.int32)
    else:
        # tf.random.categorical expects unnormalized log-probabilities, but the
        # model emits softmax probabilities, so take the log before scaling.
        # The small constant avoids log(0).
        scaled_logits = tf.math.log(last_probs + 1e-9) / temperature
        next_token_id = tf.squeeze(tf.random.categorical(scaled_logits, 1, dtype=tf.int32), -1)

    next_token_id = tf.expand_dims(next_token_id, -1)
    done = done | (next_token_id == end_token)
    # Finished rows (including the step that produced [END]) emit padding id 0.
    next_token_id = tf.where(done, tf.constant(0, dtype=tf.int32), next_token_id)
    new_state = tf.concat([state, next_token_id], axis=1)

    return next_token_id, done, new_state


def translate(model, spanish_text, target_text_processor, temperature=0.0):
    """Decode a translation for the first sequence in `spanish_text`.

    NOTE(review): despite the parameter name, the caller visible in this
    notebook passes tokenized context (source) ids — confirm the naming.

    Args:
        model: Seq2seq model forwarded to `get_next_token`.
        spanish_text: Tokenized source ids, a single sequence (rank 1) or a batch.
        target_text_processor: Vectorization layer providing the target vocabulary.
        temperature: Forwarded to `get_next_token`.

    Returns:
        The decoded words of the first row joined by single spaces, without
        `[START]`, `[UNK]` and empty tokens, cut at the first padding id or
        `[END]`.
    """
    if len(spanish_text.shape) == 1:
        spanish_text = tf.expand_dims(spanish_text, 0)

    next_token, done, context = get_initial_state(model, spanish_text, target_text_processor)
    state = next_token
    step_tokens = []

    # Decode token by token until every row is finished or the length limit is hit.
    for _ in range(config['max_length']):
        next_token, done, state = get_next_token(
            model, context, next_token, done, state, target_text_processor, temperature
        )
        step_tokens.append(next_token)
        if tf.reduce_all(done):
            break

    decoded = tf.concat(step_tokens, axis=-1)
    vocab = target_text_processor.get_vocabulary()

    skipped = ('[START]', '[UNK]', '')
    words = []
    # Only the first row of the batch is turned into text.
    for token_id in decoded[0].numpy():
        if token_id == 0:
            break
        word = vocab[token_id]
        if word == '[END]':
            break
        if word in skipped:
            continue
        words.append(word)

    return ' '.join(words)


def compare_translations(model, spanish_input, target_out, context_text_processor, target_text_processor, n=5):
    """Print source sentence, reference and model translation for up to `n` rows.

    NOTE(review): despite its name, `spanish_input` is decoded with the context
    vocabulary and printed under the "English" label — confirm the naming.

    Args:
        model: Seq2seq model forwarded to `translate`.
        spanish_input: Batch of tokenized context ids.
        target_out: Batch of tokenized reference (label) ids.
        context_text_processor: Vectorization layer providing the context vocabulary.
        target_text_processor: Vectorization layer providing the target vocabulary.
        n: Maximum number of rows to print.
    """
    context_words_by_id = context_text_processor.get_vocabulary()
    target_words_by_id = target_text_processor.get_vocabulary()
    markers = ('[START]', '[END]')

    rows_to_show = min(n, spanish_input.shape[0])
    for i in range(rows_to_show):
        # Source sentence: every non-padding context id mapped back to its word.
        source_sentence = ' '.join(
            context_words_by_id[t] for t in spanish_input[i].numpy() if t > 0
        )

        # Reference translation: non-padding label ids without the marker tokens.
        reference_words = []
        for t in target_out[i].numpy():
            if t <= 0:
                continue
            word = target_words_by_id[t]
            if word not in markers:
                reference_words.append(word)
        ground_truth = ' '.join(reference_words)

        # Model translation
        model_output = translate(model, spanish_input[i], target_text_processor)

        print(f"\n{i+1}. English: {source_sentence}")
        print(f"   GROUNDTRUTH: {ground_truth}")
        print(f"   TRANSLATION: {model_output}")


# Create distributed dataset + iterator
dist_val_dataset = strategy.experimental_distribute_dataset(val_dataset)
dist_val_iter = iter(dist_val_dataset)

# Print five sample translations from each of the first three validation batches.
for batch_no in range(3):
    (ex_context_tok, ex_tar_in), ex_tar_out = next(dist_val_iter)
    # Keep only the first local replica's part of the distributed batch.
    ex_context_tok = strategy.experimental_local_results(ex_context_tok)[0]
    ex_tar_out = strategy.experimental_local_results(ex_tar_out)[0]

    compare_translations(
        loaded_model,
        ex_context_tok,
        ex_tar_out,
        context_text_processor,
        target_text_processor,
        n=5
    )


1. English: this is the same necklace that i lost yesterday .
   GROUNDTRUTH: este collar es igual al que perdi ayer .
   TRANSLATION: este es el mismo capaz de que perdi ayer .

2. English: this is the strongest dog that i have ever seen .
   GROUNDTRUTH: este es el perro mas fuerte que jamas haya visto .
   TRANSLATION: este es el perro mas fuerte que haya visto jamas .

3. English: this is your last chance to spend time with tom .
   GROUNDTRUTH: esta es tu ultima oportunidad de pasar tiempo con tom .
   TRANSLATION: esta es tu ultima oportunidad de tom con tom .

4. English: this material will stand up to lots of [UNK] .
   GROUNDTRUTH: este material [UNK] un monton de [UNK] .
   TRANSLATION: este se a muchas .

5. English: this medicine should be taken every three hours .
   GROUNDTRUTH: este medicamento debe ser tomado cada tres horas .
   TRANSLATION: este medicamento deberia haber tomado cada tres horas .

1. English: his salary is double what it was seven years ago .
   GROUN

In [32]:
def translate_batch(model, spanish_batch, target_text_processor, temperature=0.0):
    """Translate a batch of tokenized sequences.

    NOTE(review): despite its name, the caller visible in this notebook passes
    tokenized context (source) ids as `spanish_batch` — confirm the naming.

    Args:
        model: Seq2seq model called as `model([context, decoder_input])` and
            returning per-position softmax probabilities over the target
            vocabulary.
        spanish_batch: Batch of tokenized source ids.
        target_text_processor: Vectorization layer providing the target
            vocabulary, including `[START]` and `[END]`.
        temperature: `0.0` selects the most probable token (greedy); any other
            value samples from the temperature-scaled distribution.

    Returns:
        A list with one string per row: the decoded words joined by spaces,
        without `[START]`, `[END]`, `[UNK]` and empty tokens.
    """
    vocab = target_text_processor.get_vocabulary()
    start_token = vocab.index('[START]')
    end_token = vocab.index('[END]')
    max_length = config['max_length']

    batch_size = tf.shape(spanish_batch)[0]
    state = tf.fill([batch_size, 1], start_token)
    done = tf.zeros([batch_size], dtype=tf.bool)

    for _ in range(max_length):
        prefix_len = tf.shape(state)[1]
        # The model has a fixed decoder length, so right-pad the prefix with zeros.
        padded_state = tf.pad(state, [[0, 0], [0, max_length - prefix_len]])[:, :max_length]
        # Call the model directly instead of model.predict(): predict() is built
        # for large datasets and adds substantial per-call overhead in this loop.
        probs = model([spanish_batch, padded_state], training=False)
        # Distribution for the next token = prediction at the last filled position.
        last_probs = probs[:, prefix_len - 1, :]

        if temperature == 0.0:
            next_token_id = tf.argmax(last_probs, axis=-1, output_type=tf.int32)
        else:
            # tf.random.categorical expects unnormalized log-probabilities, but
            # the model emits softmax probabilities, so take the log before
            # scaling. The small constant avoids log(0).
            scaled_logits = tf.math.log(last_probs + 1e-9) / temperature
            next_token_id = tf.squeeze(tf.random.categorical(scaled_logits, 1, dtype=tf.int32), -1)

        next_token_id = tf.expand_dims(next_token_id, -1)
        done = done | tf.squeeze(next_token_id == end_token, -1)
        # Finished rows (including the step that produced [END]) emit padding id 0.
        next_token_id = tf.where(tf.expand_dims(done, -1), 0, next_token_id)
        state = tf.concat([state, next_token_id], axis=1)

        if tf.reduce_all(done):
            break

    # Convert to sentences (one device-to-host copy for the whole batch).
    skipped = ('[START]', '[END]', '[UNK]', '')
    results = []
    for row in state.numpy():
        words = []
        for token_id in row:
            if token_id == 0 or token_id == end_token:
                break
            word = vocab[token_id]
            if word not in skipped:
                words.append(word)
        results.append(' '.join(words))

    return results

In [45]:
# Corpus-level BLEU of greedy translations over every batch of val_dataset.
refs = []
hyps = []
vocab = target_text_processor.get_vocabulary()

for (ex_context_tok, ex_tar_in), ex_tar_out in tqdm(val_dataset, desc="Calculating BLEU"):
    # Translate entire batch at once
    preds = translate_batch(loaded_model, ex_context_tok, target_text_processor)
    hyps.extend(preds)
    
    # Get references
    # References are rebuilt from the label ids, so they are in standardized form and
    # out-of-vocabulary words appear as [UNK].
    # NOTE(review): translate_batch drops [UNK] from the hypotheses while the references
    # keep it — confirm this asymmetry is intended.
    for i in range(ex_context_tok.shape[0]):
        ref_tokens = [vocab[t] for t in ex_tar_out[i].numpy() 
                     if t > 0 and vocab[t] not in ['[START]', '[END]']]
        # One single-reference list per sentence.
        refs.append([' '.join(ref_tokens)])

# Calculate BLEU
# zip(*refs) transposes the per-sentence lists into one reference stream aligned with hyps.
bleu = sacrebleu.corpus_bleu(hyps, list(zip(*refs)))
print(f"BLEU: {bleu.score:.2f}")

Calculating BLEU: 100%|██████████| 23/23 [09:17<00:00, 24.26s/it]
That's 100 lines that end in a tokenized period ('.')
It looks like you forgot to detokenize your test data, which may hurt your score.
If you insist your data is detokenized, or don't care, you can suppress this message with the `force` parameter.


BLEU: 21.92
