In [1]:
import os
#Legacy TensorFlow BackEnd
os.environ['TF_USE_LEGACY_KERAS'] = '1'

In [35]:
!pip install sacrebleu --quiet
!pip install evaluate --quiet

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [36]:
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pathlib
import tensorflow_text as tf_text
from tqdm import tqdm
import sacrebleu
import pickle
from transformers import AutoTokenizer, TFAutoModelForSeq2SeqLM
from transformers import DataCollatorForSeq2Seq
from datasets import Dataset
import evaluate

In [4]:
config = {
    "max_vocab_size":5000,
    "max_length":50,
    "batch_size":128
}


GLOBAL_BATCH = config['batch_size']

In [5]:
path_to_zip = tf.keras.utils.get_file(
    'spa-eng.zip', origin='http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip',
    extract=True)

path_to_file = pathlib.Path(path_to_zip).parent/'spa-eng/spa.txt'

Downloading data from http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip


In [6]:
def load_data(path):
  text = path.read_text(encoding='utf-8')

  lines = text.splitlines()
  pairs = [line.split('\t') for line in lines]

  context = np.array([context for target, context in pairs])
  target = np.array([target for target, context in pairs])

  return target, context


context_raw,target_raw = load_data(path_to_file)

In [7]:
split_idx = int(0.9 * len(target_raw))

X_train = context_raw[:split_idx]
y_train = target_raw[:split_idx]

X_val = context_raw[split_idx:]
y_val = target_raw[split_idx:]

BUFFER_SIZE = len(X_train)

train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
train_dataset = train_dataset.shuffle(BUFFER_SIZE).batch(GLOBAL_BATCH, drop_remainder=True).prefetch(tf.data.AUTOTUNE)

# Validation dataset (no shuffle needed)
val_dataset = tf.data.Dataset.from_tensor_slices((X_val, y_val))
val_dataset = val_dataset.batch(GLOBAL_BATCH, drop_remainder=True).prefetch(tf.data.AUTOTUNE)

I0000 00:00:1766166852.144138      55 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 13942 MB memory:  -> device: 0, name: Tesla T4, pci bus id: 0000:00:04.0, compute capability: 7.5
I0000 00:00:1766166852.148025      55 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 13942 MB memory:  -> device: 1, name: Tesla T4, pci bus id: 0000:00:05.0, compute capability: 7.5


In [8]:
def tf_lower_and_split_punct_w_special_tokens(text):
  text = tf_text.normalize_utf8(text, 'NFKD')
  text = tf.strings.lower(text)
  text = tf.strings.regex_replace(text, '[^ a-z.?!,¿]', '')
  text = tf.strings.regex_replace(text, '[.?!,¿]', r' \0 ')
  text = tf.strings.strip(text)

  text = tf.strings.join(['[START]', text, '[END]'], separator=' ')
  return text

def tf_lower_and_split_punct(text):
  text = tf_text.normalize_utf8(text, 'NFKD')
  text = tf.strings.lower(text)
  text = tf.strings.regex_replace(text, '[^ a-z.?!,¿]', '')
  text = tf.strings.regex_replace(text, '[.?!,¿]', r' \0 ')
  text = tf.strings.strip(text)

  return text

In [12]:
with open('/kaggle/input/vocab/tensorflow2/default/1/context_vocab.pkl', 'rb') as f:
    context_vocab = pickle.load(f)

with open('/kaggle/input/vocab/tensorflow2/default/1/target_vocab.pkl', 'rb') as f:
    target_vocab = pickle.load(f)

In [13]:
context_text_processor = tf.keras.layers.TextVectorization(
    standardize = tf_lower_and_split_punct,
    max_tokens = config['max_vocab_size'],
    output_sequence_length = config['max_length'],
    vocabulary = context_vocab,
    ragged = False)

target_text_processor = tf.keras.layers.TextVectorization(
    standardize = tf_lower_and_split_punct_w_special_tokens,
    max_tokens = config['max_vocab_size'],
    output_sequence_length = config['max_length'] + 1,
    vocabulary = target_vocab,
    ragged = False)

In [14]:
# context_text_processor.adapt(train_dataset.map(lambda context, target: context))

print(context_text_processor.get_vocabulary()[:10])


# target_text_processor.adapt(train_dataset.map(lambda context, target: target))
target_text_processor.get_vocabulary()[:10]

['', '[UNK]', np.str_('.'), np.str_('i'), np.str_('the'), np.str_('to'), np.str_('you'), np.str_('tom'), np.str_('?'), np.str_('a')]


['',
 '[UNK]',
 np.str_('[START]'),
 np.str_('[END]'),
 np.str_('.'),
 np.str_('que'),
 np.str_('el'),
 np.str_('de'),
 np.str_('no'),
 np.str_('tom')]

In [15]:
def process_text(context, target):
  context = context_text_processor(context)
  target = target_text_processor(target)
  targ_in = target[:,:-1]
  targ_out = target[:,1:]
  return (context, targ_in), targ_out


train_dataset = train_dataset.map(process_text, num_parallel_calls=tf.data.AUTOTUNE)
val_dataset = val_dataset.map(process_text, num_parallel_calls = tf.data.AUTOTUNE)

In [17]:
rnn = tf.keras.models.load_model("/kaggle/input/translation-weights/tensorflow2/default/1/BI-RNN-Cross-ATT.keras",compile=False)
lstm = tf.keras.models.load_model("/kaggle/input/translation-weights/tensorflow2/default/1/BI-LSTM-Cross-ATT.keras",compile=False)
gru = tf.keras.models.load_model("/kaggle/input/translation-weights/tensorflow2/default/1/BI-GRU-Cross-ATT.keras",compile=False)

In [18]:
def get_initial_state(model, context, target_text_processor):
    if len(context.shape) == 1:
        context = tf.expand_dims(context, 0)
    
    batch_size = tf.shape(context)[0]
    vocab = target_text_processor.get_vocabulary()
    start_token = vocab.index('[START]')
    
    next_token = tf.fill([batch_size, 1], start_token)
    done = tf.zeros([batch_size, 1], dtype=tf.bool)
    
    return next_token, done, context


def get_next_token(model, context, next_token, done, state, target_text_processor, temperature=0.0):
    vocab = target_text_processor.get_vocabulary()
    end_token = vocab.index('[END]')
    
    padded_state = tf.pad(state, [[0, 0], [0, config['max_length'] - tf.shape(state)[1]]])[:, :config['max_length']]
    logits = model.predict([context, padded_state], verbose=0)
    last_logits = logits[:, tf.shape(state)[1] - 1, :]
    
    if temperature == 0.0:
        next_token_id = tf.argmax(last_logits, axis=-1, output_type=tf.int32)
    else:
        next_token_id = tf.squeeze(tf.random.categorical(last_logits / temperature, 1, dtype=tf.int32), -1)
    
    next_token_id = tf.expand_dims(next_token_id, -1)
    done = done | (next_token_id == end_token)
    next_token_id = tf.where(done, tf.constant(0, dtype=tf.int32), next_token_id)
    new_state = tf.concat([state, next_token_id], axis=1)
    
    return next_token_id, done, new_state


def translate(model, spanish_text, target_text_processor, temperature=0.0):
    if len(spanish_text.shape) == 1:
        spanish_text = tf.expand_dims(spanish_text, 0)
    
    next_token, done, context = get_initial_state(model, spanish_text, target_text_processor)
    state = next_token
    tokens = []
    
    for n in range(config['max_length']):
        next_token, done, state = get_next_token(model, context, next_token, done, state, target_text_processor, temperature)
        tokens.append(next_token)
        if tf.reduce_all(done):
            break
    
    tokens = tf.concat(tokens, axis=-1)
    vocab = target_text_processor.get_vocabulary()
    
    words = []
    for token_id in tokens[0].numpy():
        if token_id == 0:
            break
        word = vocab[token_id]
        if word == '[END]':
            break
        if word not in ['[START]', '[UNK]', '']:
            words.append(word)
    
    return ' '.join(words)


def compare_translations(model, spanish_input, target_out, context_text_processor, target_text_processor, n=5):
    spanish_vocab = context_text_processor.get_vocabulary()
    english_vocab = target_text_processor.get_vocabulary()
    
    for i in range(min(n, spanish_input.shape[0])):
        # Spanish input
        sp_words = [spanish_vocab[t] for t in spanish_input[i].numpy() if t > 0]
        spanish = ' '.join(sp_words)
        
        # Ground truth English
        gt_words = [english_vocab[t] for t in target_out[i].numpy() 
                    if t > 0 and english_vocab[t] not in ['[START]', '[END]']]
        ground_truth = ' '.join(gt_words)
        
        # Model translation
        model_output = translate(model, spanish_input[i], target_text_processor)
        
        print(f"\n{i+1}. English: {spanish}")
        print(f"   GROUNDTRUTH: {ground_truth}")
        print(f"   TRANSLATION: {model_output}")


def translate_batch(model, spanish_batch, target_text_processor, temperature=0.0):
    """Translate a batch of sequences."""
    vocab = target_text_processor.get_vocabulary()
    start_token = vocab.index('[START]')
    end_token = vocab.index('[END]')
    
    batch_size = tf.shape(spanish_batch)[0]
    state = tf.fill([batch_size, 1], start_token)
    done = tf.zeros([batch_size], dtype=tf.bool)
    
    for _ in range(config['max_length']):
        padded_state = tf.pad(state, [[0, 0], [0, config['max_length'] - tf.shape(state)[1]]])[:, :config['max_length']]
        logits = model.predict([spanish_batch, padded_state], verbose=0)
        last_logits = logits[:, tf.shape(state)[1] - 1, :]
        
        if temperature == 0.0:
            next_token_id = tf.argmax(last_logits, axis=-1, output_type=tf.int32)
        else:
            next_token_id = tf.squeeze(tf.random.categorical(last_logits / temperature, 1, dtype=tf.int32), -1)
        
        next_token_id = tf.expand_dims(next_token_id, -1)
        done = done | tf.squeeze(next_token_id == end_token, -1)
        next_token_id = tf.where(tf.expand_dims(done, -1), 0, next_token_id)
        state = tf.concat([state, next_token_id], axis=1)
        
        if tf.reduce_all(done):
            break
    
    # Convert to sentences
    results = []
    for i in range(batch_size):
        words = []
        for token_id in state[i].numpy():
            if token_id == 0 or token_id == end_token:
                break
            word = vocab[token_id]
            if word not in ['[START]', '[END]', '[UNK]', '']:
                words.append(word)
        results.append(' '.join(words))
    
    return results

In [19]:
refs = []
hyps = []
vocab = target_text_processor.get_vocabulary()

for (ex_context_tok, ex_tar_in), ex_tar_out in tqdm(val_dataset, desc="Calculating BLEU"):
    # Translate entire batch at once
    preds = translate_batch(rnn, ex_context_tok, target_text_processor)
    hyps.extend(preds)
    
    # Get references
    for i in range(ex_context_tok.shape[0]):
        ref_tokens = [vocab[t] for t in ex_tar_out[i].numpy() 
                     if t > 0 and vocab[t] not in ['[START]', '[END]']]
        refs.append([' '.join(ref_tokens)])

# Calculate BLEU
bleu_rnn = sacrebleu.corpus_bleu(hyps, list(zip(*refs)))
print(f"BLEU: {bleu_rnn.score:.2f}")

Calculating BLEU: 100%|██████████| 92/92 [08:18<00:00,  5.42s/it]
That's 100 lines that end in a tokenized period ('.')
It looks like you forgot to detokenize your test data, which may hurt your score.
If you insist your data is detokenized, or don't care, you can suppress this message with the `force` parameter.


BLEU: 20.67


In [20]:
refs = []
hyps = []
vocab = target_text_processor.get_vocabulary()

for (ex_context_tok, ex_tar_in), ex_tar_out in tqdm(val_dataset, desc="Calculating BLEU"):
    # Translate entire batch at once
    preds = translate_batch(lstm, ex_context_tok, target_text_processor)
    hyps.extend(preds)
    
    # Get references
    for i in range(ex_context_tok.shape[0]):
        ref_tokens = [vocab[t] for t in ex_tar_out[i].numpy() 
                     if t > 0 and vocab[t] not in ['[START]', '[END]']]
        refs.append([' '.join(ref_tokens)])

# Calculate BLEU
bleu_lstm = sacrebleu.corpus_bleu(hyps, list(zip(*refs)))
print(f"BLEU: {bleu_lstm.score:.2f}")

Calculating BLEU:   0%|          | 0/92 [00:00<?, ?it/s]I0000 00:00:1766167464.148323     132 cuda_dnn.cc:529] Loaded cuDNN version 91002
Calculating BLEU: 100%|██████████| 92/92 [08:19<00:00,  5.43s/it]
That's 100 lines that end in a tokenized period ('.')
It looks like you forgot to detokenize your test data, which may hurt your score.
If you insist your data is detokenized, or don't care, you can suppress this message with the `force` parameter.


BLEU: 21.47


In [21]:
refs = []
hyps = []
vocab = target_text_processor.get_vocabulary()

for (ex_context_tok, ex_tar_in), ex_tar_out in tqdm(val_dataset, desc="Calculating BLEU"):
    # Translate entire batch at once
    preds = translate_batch(gru, ex_context_tok, target_text_processor)
    hyps.extend(preds)
    
    # Get references
    for i in range(ex_context_tok.shape[0]):
        ref_tokens = [vocab[t] for t in ex_tar_out[i].numpy() 
                     if t > 0 and vocab[t] not in ['[START]', '[END]']]
        refs.append([' '.join(ref_tokens)])

# Calculate BLEU
bleu_gru = sacrebleu.corpus_bleu(hyps, list(zip(*refs)))
print(f"BLEU: {bleu_gru.score:.2f}")

Calculating BLEU: 100%|██████████| 92/92 [11:18<00:00,  7.38s/it]
That's 100 lines that end in a tokenized period ('.')
It looks like you forgot to detokenize your test data, which may hurt your score.
If you insist your data is detokenized, or don't care, you can suppress this message with the `force` parameter.


BLEU: 24.06


In [23]:
print("\nCreating datasets...")
train_dict = {"source": X_train, "target": y_train}
val_dict = {"source": X_val, "target": y_val}


Creating datasets...


In [26]:
train_dataset = Dataset.from_dict(train_dict)
val_dataset = Dataset.from_dict(val_dict)

In [29]:
tokenizer = AutoTokenizer.from_pretrained("/kaggle/input/transformer/tensorflow2/default/1/my-model")
model = TFAutoModelForSeq2SeqLM.from_pretrained("/kaggle/input/transformer/tensorflow2/default/1/my-model")

TensorFlow and JAX classes are deprecated and will be removed in Transformers v5. We recommend migrating to PyTorch classes or pinning your version of Transformers.
All model checkpoint layers were used when initializing TFMarianMTModel.

All the layers of TFMarianMTModel were initialized from the model checkpoint at /kaggle/input/transformer/tensorflow2/default/1/my-model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFMarianMTModel for predictions without further training.


In [30]:
def preprocess_function(examples):
    inputs = tokenizer(
        examples["source"],
        max_length=config["max_length"],
        truncation=True,
        padding="max_length"
    )

    targets = tokenizer(
        examples["target"],
        max_length=config["max_length"],
        truncation=True,
        padding="max_length"
    )

    labels = []
    for label_ids in targets["input_ids"]:
        label_ids_copy = label_ids.copy()
        label_ids_copy = [-100 if token_id == tokenizer.pad_token_id else token_id
                         for token_id in label_ids_copy]
        labels.append(label_ids_copy)

    inputs["labels"] = labels
    return inputs


print("Tokenizing datasets...")
train_dataset = train_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=["source", "target"]
)
val_dataset = val_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=["source", "target"]
)

Tokenizing datasets...


Map:   0%|          | 0/107067 [00:00<?, ? examples/s]

Map:   0%|          | 0/11897 [00:00<?, ? examples/s]

In [31]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    return_tensors="tf"
)

train_dataset = train_dataset.to_tf_dataset(
    columns=['input_ids', 'attention_mask', 'labels'],
    shuffle=True,
    batch_size= config['batch_size'] ,
    collate_fn=data_collator
).prefetch(tf.data.AUTOTUNE)

val_dataset = val_dataset.to_tf_dataset(
    columns=['input_ids', 'attention_mask', 'labels'],
    shuffle=False,
    batch_size= config['batch_size'],
    collate_fn=data_collator
).prefetch(tf.data.AUTOTUNE)

In [37]:
print("\n" + "="*60)
print("Evaluating with BLEU score...")
print("="*60)

metric = evaluate.load("sacrebleu")


Evaluating with BLEU score...


Downloading builder script: 0.00B [00:00, ?B/s]

In [38]:
def translate(text,tokenizer,model, max_length=50):
    """Translate English text to Spanish"""
    inputs = tokenizer(text, return_tensors="tf", max_length=max_length, truncation=True)
    outputs = model.generate(
        inputs["input_ids"],
        max_length=max_length,
        num_beams=4,
        early_stopping=True
    )
    translation = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return translation

In [48]:
predictions = []
references = []
batch_size = 256

print("Generating translations...")

for i in tqdm(range(0, len(X_val), batch_size), desc="Translating"):
    batch = X_val[i:i+batch_size].tolist()
    
    # Tokenize and generate in one go
    inputs = tokenizer(batch, return_tensors="tf", padding=True, truncation=True, max_length=50)
    outputs = model.generate(inputs["input_ids"], max_length=50, num_beams=4, early_stopping=True)
    
    # Decode all at once
    batch_preds = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    
    predictions.extend(batch_preds)
    references.extend([[ref] for ref in y_val[i:i+batch_size]])

bleu_score = metric.compute(predictions=predictions, references=references)
print(f"\nBLEU Score: {bleu_score['score']:.2f}")

Generating translations...


Translating:   2%|▏         | 1/47 [10:03<7:43:02, 603.97s/it]


KeyboardInterrupt: 