In [1]:
import keras_hub
import random

import keras
from keras import ops

import tensorflow.data as tf_data
import tensorflow as tf
from tensorflow_text.tools.wordpiece_vocab import (
    bert_vocab_from_dataset,
)
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# imports for pre-parsing
from pyparsing import Word, alphas as pp_alpha, nums as pp_nums
import pyparsing as pp
# Turn on packrat memoization for every pyparsing grammar built below.
# Uses the PEP 8 name, matching the other pyparsing 3 calls in this notebook
# (`one_of`, `parse_string`); `enablePackrat` is the legacy alias.
pp.ParserElement.enable_packrat()
import re

In [3]:
# handshapes
#
# Handshape labels recognised by the gloss grammar; this list is passed to
# `pp.one_of(...)` to build the `handshape` parser element.
# Entries are plain strings: digits, single letters, and hyphenated variants
# (e.g. "bent-B-L", "flat-O-2").

handshapelist = [
  '1',
  '3',
  '4',
  '5',
  '6',
  '7',
  '8',
  '9',
  '10',
  '25',
  'A',
  'B',
  'C',
  'D',
  'E',
  'F',
  'G',
  'H-U',
  'I',
  'K',
  'L',
  'M',
  'N',
  'O',
  'R',
  'S',
  'T',
  'U',
  'V',
  'W',
  'X',
  'Y',
  'C-L',
  'U-L',
  'B-L',
  'P-K',
  'Q-G',
  'L-X',
  'I-L-Y',
  '5-C',
  '5-C-L',
  '5-C-tt',
  'alt-M',
  'alt-N',
  'alt-P',
  'B-xd',
  'baby-O',
  'bent-1',
  'bent-B',
  'bent-B-L',
  'bent-horns',
  'bent-L',
  'bent-M',
  'bent-N',
  'bent-U',
  'cocked-8',
  'cocked-F',
  'cocked-S',
  'crvd-5',
  'crvd-B',
  'crvd-flat-B',
  'crvd-L',
  'crvd-sprd-B',
  'crvd-U',
  'crvd-V',
  'fanned-flat-O',
  'flat-B',
  'flat-C',
  'flat-F',
  'flat-G',
  'flat-O-2',
  'flat-O',
  'full-M',
  'horns',
  'loose-E',
  'O-2-horns',
  'open-8',
  'open-F',
  'sml-C-3',
  'tight-C-2',
  'tight-C',
  'X-over-thumb'
]

In [4]:
# regex rules
#
# These patterns define what counts as an upper-case gloss "word" for the
# pyparsing `word` element.

# alpha_regexp, piece by piece:
#   (?!((?:THUMB-)?(?:IX-|POSS-|SELF-)))  negative lookahead: reject text that
#                                         starts with "IX-", "POSS-" or "SELF-",
#                                         optionally preceded by "THUMB-"
#   [A-Z](?:[A-Z/'-]*[A-Z])?              one upper-case letter, optionally
#                                         followed by upper-case letters, "/",
#                                         "'" or "-", ending on an upper-case
#                                         letter
#   (?:\.)?                               optional trailing period
alpha_regexp  = r"(?!((?:THUMB-)?(?:IX-|POSS-|SELF-)))[A-Z](?:[A-Z/'-]*[A-Z])?(?:\.)?"
# The match must be followed either by something that is not a lower-case
# a-z letter, or by the literal text "wg".
lookahead_regexp = r"(?:(?![a-z])|(?=wg))"

# Final word pattern: alpha_regexp followed by the lookahead. The leading
# (?x) switches on verbose mode, so the layout whitespace inside this
# template is ignored by the regex engine.
word_all_regexp = r"""(?x)
    (?: %s )
    %s
""" % (alpha_regexp, lookahead_regexp)

In [5]:
# conventions kept for parsing
#
# Terminal parser elements; they are combined into `full_grammar` in the
# next cell. Each one matches a fixed literal (or one of a fixed set of
# literals) unless noted otherwise.

# Prefixes and lexical markers
cl_prefix = pp.one_of(["CL", "DCL", "LCL", "SCL", "BCL", "BPCL", "PCL", "ICL"])
ns_prefix = pp.Literal("ns")
fs_prefix = pp.Literal("fs")
lex_exceptions = pp.one_of(["part", "WHAT"])
aspect_text = pp.Literal("aspect")
# Indexing / pronoun cores and their modifiers
index_core_ix = pp.Literal("IX")
other_index_core = pp.one_of(["POSS", "SELF"])
# Any label from `handshapelist`
handshape = pp.one_of(handshapelist)
person = pp.one_of(["1p", "2p", "3p"])
arc = pp.Literal("arc") 
loc = pp.Literal("loc")
pl = pp.Literal("pl")
# Punctuation-like symbols
compound = pp.Literal("+")
hashtag = pp.Literal("#")
sym = pp.Literal(">")
par1 = pp.Literal("(")
par2 = pp.Literal(")")
dash = pp.Literal("-")
contraction = pp.Literal("^")
colon = pp.Literal(":")
omit_quote = pp.Literal("xx")
period = pp.Literal(".")
# Single-character fallbacks: one letter / one digit (max=1)
alpha = pp.Word(pp_alpha, max=1)
num = pp.Word(pp_nums, max=1)
# Upper-case gloss word; re.X is passed here even though the pattern
# already begins with the inline (?x) flag.
word = pp.Regex(word_all_regexp, flags=re.X)

In [6]:
# grammar rules
#
# A gloss string is parsed as one or more tokens. The alternatives are
# joined with `|`, so they are tried in the order listed and the first one
# that matches wins; the single-character `num` / `alpha` fallbacks are
# therefore kept at the end.
#
# NOTE(review): because `cl_prefix` is tried before `word`, an upper-case
# gloss that merely starts with a classifier prefix (e.g. "CLOSE") would
# presumably be split into 'CL' + 'OSE' — confirm against the data and, if
# needed, guard the prefix with a lookahead.

full_grammar = pp.OneOrMore(
    cl_prefix |               # classifiers like CL, DCL, etc.
    word |
    ns_prefix |               # non-specific ns
    fs_prefix |               # fingerspelling fs
    index_core_ix |           # IX
    other_index_core |        # POSS, SELF
    person |                  # 1p, 2p, 3p
    lex_exceptions |          # part, WHAT
    aspect_text |             # aspect
    arc |                     # arc
    loc |                     # loc
    pl |                      # pl
    handshape |               # handshapes like B, 1, 5, etc.
    compound |                # +
    hashtag |                 # #
    sym |                     # >
    contraction |             # ^
    colon |                   # :
    par1 | par2 |             # ( and )
    omit_quote |              # xx
    period |                  # .
    dash |                    # -
    num |                     # single digit
    alpha                     # fallback LAST
)

In [7]:
# testing grammar parsing

# Parse two sample glosses straight through the grammar and show the tokens.
trial, trial2 = (
    full_grammar.parse_string(sample_gloss, parse_all=True).asList()
    for sample_gloss in ("SCL:1xx", "IX-1p BCLxx FIND/FIND-OUT fs-HER")
)

print(trial, trial2, sep="\n")

['SCL', ':', '1', 'xx']
['IX', '-', '1p', 'BCL', 'xx', 'FIND/FIND-OUT', 'fs', '-', 'HER']


In [8]:
# tokenize based on predefined grammar rules

def custom_asl_tokenize(text):
    """Split an ASL gloss sentence into tokens using `full_grammar`.

    Args:
        text: Gloss sentence as a single string.

    Returns:
        A list of token strings. If the grammar cannot consume the whole
        string, the failure is reported on stdout and an empty list is
        returned (best-effort: the caller keeps going).
    """
    try:
        # `as_list` is the PEP 8 name, consistent with `parse_string` above.
        return full_grammar.parse_string(text, parse_all=True).as_list()
    except pp.ParseException as pe:
        # Include the offending sentence so the failure can be traced back to
        # the corpus line; the parser message alone only gives a column.
        print(f"Failed to parse {text!r}: {pe}")
        return []

In [9]:
def custom_eng_tokenize(text):
    """Tokenize an English sentence into lower-case tokens.

    Every punctuation character and every digit becomes its own token;
    everything else is split on whitespace.

    Args:
        text: English sentence.

    Returns:
        List of lower-cased token strings.
    """
    # Preserve punctuation and digits by padding each one with spaces, so the
    # whitespace split below isolates them as single-character tokens.
    spaced = re.sub(r'([^\w\s]|\d)', r' \1 ', text)
    return spaced.lower().split()

In [10]:
# testing custom_asl_tokenize

# Run the ASL tokenizer on two sample glosses and show the tokens.
trial, trial2 = (
    custom_asl_tokenize(sample_gloss)
    for sample_gloss in ("SCL:1xx", 'SCL:1xx SHOWER WASH FEEL THUMBS-UP/GOOD')
)

print(trial, trial2, sep="\n")

['SCL', ':', '1', 'xx']
['SCL', ':', '1', 'xx', 'SHOWER', 'WASH', 'FEEL', 'THUMBS-UP/GOOD']


In [11]:
# testing custom_eng_tokenize

# Run the English tokenizer on two sample sentences and show the tokens.
trial, trial2 = (
    custom_eng_tokenize(sample_sentence)
    for sample_sentence in (
        "But all the same, he told me I better go downstairs and get an x-ray.",
        'I waited 234 years and 2,142 days',
    )
)

print(trial, trial2, sep="\n")

['but', 'all', 'the', 'same', ',', 'he', 'told', 'me', 'i', 'better', 'go', 'downstairs', 'and', 'get', 'an', 'x', '-', 'ray', '.']
['i', 'waited', '2', '3', '4', 'years', 'and', '2', ',', '1', '4', '2', 'days']


In [12]:
# model parameters / hyperparameters

# Training loop: examples per batch and number of passes over the data.
BATCH_SIZE = 16
EPOCHS = 30 
# Transformer sizes: embedding width, feed-forward width, attention heads.
EMBED_DIM = 128
INTERMEDIATE_DIM = 512
NUM_HEADS = 4

# Token sequences are padded/truncated to this length; it is also the
# length of the position embeddings.
MAX_SEQUENCE_LENGTH = 100
# Target WordPiece vocabulary sizes, also used as the embedding/output sizes.
# The "+ 4" presumably accounts for the four reserved tokens
# ([PAD], [UNK], [START], [END]) — TODO confirm.
ENG_VOCAB_SIZE = 1678 + 4
ASL_VOCAB_SIZE = 1117 + 4
# NOTE(review): not referenced by any cell shown in this notebook (the
# shuffle buffer in make_dataset hard-codes 1400) — confirm whether needed.
num_samples = 1400

# Tab-separated "english<TAB>asl gloss" file, one pair per line.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
data_path = "/Users/adrianajimenez/Desktop/Downloads/REUAICT/Real-Code/2025-ASL-data/data/sent_pairs.txt"

In [13]:
# generate
    # 1) list of eng-asl sentence pairs
    # 2) set of unique english vocab
    # 3) set of unique asl vocab

text_pairs = []

with open(data_path, "r", encoding="utf-8") as f:
    lines = f.read().split("\n")

for line_number, line in enumerate(lines, start=1):
    # Tolerate Windows line endings and blank lines (including the empty
    # string produced by a trailing newline) instead of crashing on unpack.
    line = line.rstrip("\r")
    if not line.strip():
        continue
    fields = line.split("\t")
    if len(fields) != 2:
        raise ValueError(
            f"{data_path}, line {line_number}: expected 2 tab-separated "
            f"fields (english, asl), found {len(fields)}"
        )
    eng_text, asl_text = fields
    # English side is lower-cased; the ASL gloss keeps its original casing.
    text_pairs.append([eng_text.lower(), asl_text])

# Unique tokens per language, as sorted lists. Sentences the ASL grammar
# cannot parse contribute no tokens (custom_asl_tokenize returns []).
eng_tokens = sorted(
    {token for pair in text_pairs for token in custom_eng_tokenize(pair[0])}
)
asl_tokens = sorted(
    {token for pair in text_pairs for token in custom_asl_tokenize(pair[1])}
)

print("eng_tokens:", eng_tokens)
print("asl_tokens", asl_tokens)
num_encoder_tokens = len(eng_tokens)
num_decoder_tokens = len(asl_tokens)
print("num_eng_tokens", num_encoder_tokens)
print("num_asl_tokens", num_decoder_tokens)

Failed to parse: Expected end of text, found '/'  (at char 19), (line:1, col:20)
Failed to parse: Expected end of text, found '/'  (at char 23), (line:1, col:24)
eng_tokens: ['!', '"', '$', "'", '(', ')', '*', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'a', 'aaahhh', 'abdominal', 'able', 'about', 'accepted', 'accident', 'across', 'act', 'action', 'actually', 'admit', 'advantage', 'adventure', 'advice', 'after', 'again', 'against', 'ago', 'agree', 'ah', 'ahead', 'ahh', 'ail', 'air', 'ali', 'all', 'alone', 'along', 'already', 'alright', 'also', 'am', 'amazing', 'an', 'and', 'angeles', 'angled', 'angry', 'animal', 'ankle', 'ann', 'another', 'answer', 'any', 'anymore', 'anyone', 'anything', 'anyway', 'anywhere', 'applaud', 'apple', 'approach', 'are', 'area', 'arm', 'arms', 'around', 'arrest', 'arrested', 'arrive', 'arrived', 'arrives', 'arriving', 'as', 'aside', 'ask', 'asked', 'asks', 'assistant', 'at', 'ate', 'attack', 'attacker', 'attention', 'a

In [14]:
# glimpse pairs

# Show five randomly chosen (english, asl) pairs, one per line.
print(*(random.choice(text_pairs) for _ in range(5)), sep="\n")

['john was reading that book, what did he say it is about?', 'IX-3p fs-JOHN READ BOOK IX-3p SAY IX-3p xx ABOUT IX-3p']
['did the teacher already buy a house yesterday?', 'YESTERDAY TEACHER BUY HOUSE FINISH']
['i looked at cows while i was driving and i felt sad for them because they couldn\'t even move. at end of the "jail", i saw a butcher shop where they were slaughtering the cows. i also saw some dead cows that were hanging!', 'IX-1p REALLY SAD COW CANNOT MOVE REALLY SEE (5)WOW WOW/AWFUL SCL:3xx REALLY END OF THAT DCL:4xx IX-loc HAVE fs-BUTCHER-SHOP REALLY ICLxx SHOW DEAD COW BPCL:2xx']
['as i was driving, i was really bored. so i started thinking about what i could do.', 'DRIVE BORE IX-1p DRIVE #DO']
['will john finish reading soon?', 'IX-3p fs-JOHN READ IX-3p FINISH (L)FUTURE SOON IX-3p']


In [15]:
# split data

# Shuffle with a dedicated, seeded generator so the train/val/test split is
# the same on every Restart & Run All, without touching the global `random`
# state. The shuffle is in place: re-running only this cell shuffles the
# already-shuffled list again, so re-run the loading cell first.
SPLIT_SEED = 42
random.Random(SPLIT_SEED).shuffle(text_pairs)

# 15% validation, 15% test, remainder for training.
num_val_samples = int(0.15 * len(text_pairs))
num_train_samples = len(text_pairs) - 2 * num_val_samples
train_pairs = text_pairs[:num_train_samples]
val_pairs = text_pairs[num_train_samples : num_train_samples + num_val_samples]
test_pairs = text_pairs[num_train_samples + num_val_samples :]

print(f"{len(text_pairs)} total pairs")
print(f"{len(train_pairs)} training pairs")
print(f"{len(val_pairs)} validation pairs")
print(f"{len(test_pairs)} test pairs")

1406 total pairs
986 training pairs
210 validation pairs
210 test pairs


In [16]:
def train_word_piece(text_samples, vocab_size, reserved_tokens):
    """Learn a WordPiece vocabulary from raw text samples.

    Args:
        text_samples: Sequence of strings to learn the vocabulary from.
        vocab_size: Target vocabulary size passed to keras_hub.
        reserved_tokens: Special tokens passed through to keras_hub.

    Returns:
        The vocabulary computed by
        `keras_hub.tokenizers.compute_word_piece_vocabulary`.
    """
    # Feed the samples in batches of 500, prefetching two batches ahead.
    batched_samples = (
        tf_data.Dataset.from_tensor_slices(text_samples).batch(500).prefetch(2)
    )
    return keras_hub.tokenizers.compute_word_piece_vocabulary(
        batched_samples,
        vocabulary_size=vocab_size,
        reserved_tokens=reserved_tokens,
    )

In [17]:
# Special tokens placed at the front of both vocabularies.
reserved_tokens = ["[PAD]", "[UNK]", "[START]", "[END]"]

# Vocabularies are learned from the training split only.
eng_samples = [eng_text for eng_text, _asl_text in train_pairs]
trained_eng_vocab = train_word_piece(eng_samples, ENG_VOCAB_SIZE, reserved_tokens)

asl_samples = [asl_text for _eng_text, asl_text in train_pairs]
trained_asl_vocab = train_word_piece(asl_samples, ASL_VOCAB_SIZE, reserved_tokens)

print(trained_eng_vocab, trained_asl_vocab, sep="\n")

2025-07-15 23:50:30.111494: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M2
2025-07-15 23:50:30.111525: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 8.00 GB
2025-07-15 23:50:30.111547: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 2.67 GB
I0000 00:00:1752637830.112142 9896833 pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
I0000 00:00:1752637830.112212 9896833 pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
2025-07-15 23:50:30.916491: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2025-07-15 23:50:32.371043: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with

['[PAD]', '[UNK]', '[START]', '[END]', '!', '"', '$', "'", '(', ')', '*', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'the', 'to', 'and', '##s', 'john', 'it', 'was', 'he', 'that', 'in', '##d', '##ed', '##y', '##e', '##ing', 'is', 'car', 'my', 'book', 'of', '##t', '##n', 'we', 'said', 'house', 'mother', 'will', 'you', 'teacher', 'for', 'really', 'did', 'his', 'buy', 'on', 'at', '##er', 'they', 'up', '##es', 'but', '##k', 'mary', '##r', 'have', 'know', 'me', 'had', 'reading', 'there', '##m', 'she', 'him', 'like', 'so', '##ay', '##a', '##ly', 'as', 'back', 'who', 'then', '##h', 'father', 'read', 'when', 'not', 'were', 'would', '##l', 'went', 'what', 'about', 'one', '##le', 'chocolate', 'down', 'looked', 'some', '##it', 'out', 'student', 'all', 'books', 'give', 'likes', 'over', '##g', '##st', 'many', 'wolf', 'cop', 'don',

In [18]:
# Peek at ten entries from the middle of each learned vocabulary.
for vocab_label, vocab in (
    ("English Tokens: ", trained_eng_vocab),
    ("ASL Tokens: ", trained_asl_vocab),
):
    print(vocab_label, vocab[100:110])

English Tokens:  ['me', 'had', 'reading', 'there', '##m', 'she', 'him', 'like', 'so', '##ay']
ASL Tokens:  ['##Y', 'BCLxx', 'arc', 'LIKE', 'SAME', 'HOUSE', 'LOOK', '##ER', 'ICLxx', '##L']


In [19]:
# One WordPiece tokenizer per language, built from the learned vocabularies.
# Case is left untouched by the tokenizers (lowercase=False).
eng_tokenizer, asl_tokenizer = (
    keras_hub.tokenizers.WordPieceTokenizer(vocabulary=vocab, lowercase=False)
    for vocab in (trained_eng_vocab, trained_asl_vocab)
)

In [20]:
# Round-trip the first sentence pair through both tokenizers.
eng_input_ex, asl_input_ex = text_pairs[0][0], text_pairs[0][1]

eng_tokens_ex = eng_tokenizer.tokenize(eng_input_ex)
print("English sentence: ", eng_input_ex)
print("Tokens: ", eng_tokens_ex)
eng_recovered_ex = eng_tokenizer.detokenize(eng_tokens_ex)
print("Recovered text after detokenizing: ", eng_recovered_ex)

print()

asl_tokens_ex = asl_tokenizer.tokenize(asl_input_ex)
print("ASL sentence: ", asl_input_ex)
print("Tokens: ", asl_tokens_ex)
asl_recovered_ex = asl_tokenizer.detokenize(asl_tokens_ex)
print("Recovered text after detokenizing: ", asl_recovered_ex)

English sentence:  if john needs more information, he will read the book.
Tokens:  tf.Tensor(
[180  58  41  67  65  57 265  63 359 188 104 289  11  61  80 118  54  72
  13], shape=(19,), dtype=int32)
Recovered text after detokenizing:  if john needs more information , he will read the book .

ASL sentence:  fs-JOHN NEED MORE INFORMATION IX-3p FUTURE READ BOOK
Tokens:  tf.Tensor([ 74   8  78  36 264 191 302 307 250  73   8  76  92  90  80], shape=(15,), dtype=int32)
Recovered text after detokenizing:  fs - JOHN NEED MORE INFORMATION IX - 3p FUTURE READ BOOK


In [21]:
def preprocess_batch(eng, asl):
    """Turn a batch of raw sentence pairs into model inputs and targets.

    Args:
        eng: Batch of English sentences.
        asl: Batch of ASL gloss sentences.

    Returns:
        A `(features, labels)` tuple. `features` is a dict with
        "encoder_inputs" (English ids packed to MAX_SEQUENCE_LENGTH) and
        "decoder_inputs" (packed ASL ids without the last position);
        `labels` is the packed ASL ids without the first position.
    """
    eng_ids = eng_tokenizer(eng)
    asl_ids = asl_tokenizer(asl)

    # English side: pad to MAX_SEQUENCE_LENGTH, no start/end markers.
    encoder_packer = keras_hub.layers.StartEndPacker(
        sequence_length=MAX_SEQUENCE_LENGTH,
        pad_value=eng_tokenizer.token_to_id("[PAD]"),
    )
    eng_packed = encoder_packer(eng_ids)

    # ASL side: wrap in "[START]" / "[END]" and pad to one position more
    # than MAX_SEQUENCE_LENGTH, so that dropping one position below leaves
    # MAX_SEQUENCE_LENGTH for both decoder inputs and labels.
    decoder_packer = keras_hub.layers.StartEndPacker(
        sequence_length=MAX_SEQUENCE_LENGTH + 1,
        start_value=asl_tokenizer.token_to_id("[START]"),
        end_value=asl_tokenizer.token_to_id("[END]"),
        pad_value=asl_tokenizer.token_to_id("[PAD]"),
    )
    asl_packed = decoder_packer(asl_ids)

    features = {
        "encoder_inputs": eng_packed,
        "decoder_inputs": asl_packed[:, :-1],
    }
    labels = asl_packed[:, 1:]
    return features, labels


def make_dataset(pairs):
    """Build a batched, tokenized tf.data pipeline from sentence pairs.

    Args:
        pairs: Non-empty sequence of `[english, asl]` string pairs.

    Returns:
        A `tf.data.Dataset` yielding `(features, labels)` batches as produced
        by `preprocess_batch`, with batches shuffled on each iteration.
    """
    eng_texts, asl_texts = zip(*pairs)
    dataset = tf_data.Dataset.from_tensor_slices((list(eng_texts), list(asl_texts)))
    dataset = dataset.batch(BATCH_SIZE)
    dataset = dataset.map(preprocess_batch, num_parallel_calls=tf_data.AUTOTUNE)
    # Order matters: cache the (deterministic, expensive) tokenized batches
    # first, then shuffle so each epoch gets a fresh batch order, and prefetch
    # last so it overlaps with training. Caching after the shuffle would
    # freeze the first epoch's order for all later epochs.
    return dataset.cache().shuffle(1400).prefetch(16)


# Training and validation pipelines (the test split is decoded separately).
train_ds, val_ds = (
    make_dataset(split_pairs) for split_pairs in (train_pairs, val_pairs)
)

In [22]:
# Sanity-check the tensor shapes of one training batch.
for inputs, targets in train_ds.take(1):
    encoder_shape = inputs["encoder_inputs"].shape
    decoder_shape = inputs["decoder_inputs"].shape
    target_shape = targets.shape
    print(f'inputs["encoder_inputs"].shape: {encoder_shape}')
    print(f'inputs["decoder_inputs"].shape: {decoder_shape}')
    print(f"targets.shape: {target_shape}")

inputs["encoder_inputs"].shape: (16, 100)
inputs["decoder_inputs"].shape: (16, 100)
targets.shape: (16, 100)


2025-07-15 23:50:34.438976: W tensorflow/core/kernels/data/cache_dataset_ops.cc:916] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


In [23]:
# Encoder
# English token ids -> token+position embeddings -> one TransformerEncoder
# layer. Wrapped in its own `encoder` model.
# NOTE(review): the embeddings are created without `mask_zero`, so padding
# positions are presumably not masked and count toward loss/accuracy —
# confirm whether that is intended.
encoder_inputs = keras.Input(shape=(None,), name="encoder_inputs")

x = keras_hub.layers.TokenAndPositionEmbedding(
    vocabulary_size=ENG_VOCAB_SIZE,
    sequence_length=MAX_SEQUENCE_LENGTH,
    embedding_dim=EMBED_DIM,
)(encoder_inputs)

encoder_outputs = keras_hub.layers.TransformerEncoder(
    intermediate_dim=INTERMEDIATE_DIM, num_heads=NUM_HEADS
)(inputs=x)
encoder = keras.Model(encoder_inputs, encoder_outputs)


# Decoder
# ASL token ids plus the encoder output sequence -> one TransformerDecoder
# layer -> dropout -> softmax over the ASL vocabulary.
decoder_inputs = keras.Input(shape=(None,), name="decoder_inputs")
encoded_seq_inputs = keras.Input(shape=(None, EMBED_DIM), name="decoder_state_inputs")

x = keras_hub.layers.TokenAndPositionEmbedding(
    vocabulary_size=ASL_VOCAB_SIZE,
    sequence_length=MAX_SEQUENCE_LENGTH,
    embedding_dim=EMBED_DIM,
)(decoder_inputs)

x = keras_hub.layers.TransformerDecoder(
    intermediate_dim=INTERMEDIATE_DIM, num_heads=NUM_HEADS
)(decoder_sequence=x, encoder_sequence=encoded_seq_inputs)
x = keras.layers.Dropout(0.5)(x)
# The output layer applies softmax, so the model emits probabilities
# (not raw logits) for every position.
decoder_outputs = keras.layers.Dense(ASL_VOCAB_SIZE, activation="softmax")(x)
decoder = keras.Model(
    [
        decoder_inputs,
        encoded_seq_inputs,
    ],
    decoder_outputs,
)
# `decoder_outputs` is rebound here: from now on it is the decoder applied
# to the real encoder output rather than to the placeholder input.
decoder_outputs = decoder([decoder_inputs, encoder_outputs])

# End-to-end model: (english ids, shifted asl ids) -> next-token probabilities.
transformer = keras.Model(
    [encoder_inputs, decoder_inputs],
    decoder_outputs,
    name="transformer",
)

In [24]:
# Show the architecture, then train with RMSprop on the integer-labelled
# next-token targets, validating on `val_ds` after each epoch.
transformer.summary()
transformer.compile(
    optimizer="rmsprop",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)
transformer.fit(x=train_ds, validation_data=val_ds, epochs=EPOCHS)

Epoch 1/30


2025-07-15 23:50:36.233451: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 143ms/step - accuracy: 0.7660 - loss: 2.1798 - val_accuracy: 0.8557 - val_loss: 0.8114
Epoch 2/30
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 107ms/step - accuracy: 0.8546 - loss: 0.8161 - val_accuracy: 0.8643 - val_loss: 0.7080
Epoch 3/30
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 97ms/step - accuracy: 0.8613 - loss: 0.7168 - val_accuracy: 0.8683 - val_loss: 0.6661
Epoch 4/30
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 99ms/step - accuracy: 0.8655 - loss: 0.6696 - val_accuracy: 0.8690 - val_loss: 0.6406
Epoch 5/30
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 102ms/step - accuracy: 0.8681 - loss: 0.6389 - val_accuracy: 0.8695 - val_loss: 0.6245
Epoch 6/30
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 100ms/step - accuracy: 0.8681 - loss: 0.6192 - val_accuracy: 0.8695 - val_loss: 0.6156
Epoch 7/30
[1m62/62[0m [32m━━━━━━━━━━

<keras.src.callbacks.history.History at 0x16a14ba90>

In [26]:
def decode_sequences(input_sentences, max_length=40):
    """Greedily translate one English sentence into an ASL gloss string.

    Args:
        input_sentences: A list containing exactly one English sentence.
            Only a batch size of 1 is supported.
        max_length: Length of the decoder prompt, i.e. the maximum number of
            output positions including the leading "[START]". Must not
            exceed MAX_SEQUENCE_LENGTH, the decoder's position-embedding
            length. Defaults to 40.

    Returns:
        The detokenized output of the ASL tokenizer for the generated ids
        (one entry per input sentence). Special tokens are not stripped.

    Raises:
        ValueError: If `input_sentences` does not hold exactly one sentence.
    """
    batch_size = 1
    if len(input_sentences) != batch_size:
        raise ValueError(
            f"decode_sequences supports exactly {batch_size} sentence per call, "
            f"got {len(input_sentences)}"
        )

    with tf.device('/CPU:0'):
        # Tokenize the encoder input.
        encoder_input_tokens = ops.convert_to_tensor(eng_tokenizer(input_sentences))
        # Mirror the training pipeline: cut over-long inputs to
        # MAX_SEQUENCE_LENGTH (the position embeddings cover no more) ...
        encoder_input_tokens = encoder_input_tokens[:, :MAX_SEQUENCE_LENGTH]
        # ... and pad short ones with the English "[PAD]" id.
        num_missing = MAX_SEQUENCE_LENGTH - len(encoder_input_tokens[0])
        if num_missing > 0:
            pads = ops.full(
                (batch_size, num_missing), eng_tokenizer.token_to_id("[PAD]")
            )
            encoder_input_tokens = ops.concatenate(
                [encoder_input_tokens, pads], 1
            )

        # Callback for the sampler: scores for the token at `index`, given
        # the prompt so far. (Named to avoid shadowing the builtin `next`.)
        def next_token_fn(prompt, cache, index):
            # The model ends in a softmax, so these are probabilities rather
            # than raw logits; the greedy argmax is the same either way.
            logits = transformer([encoder_input_tokens, prompt])[:, index - 1, :]
            # Ignore hidden states for now; only needed for contrastive search.
            hidden_states = None
            return logits, hidden_states, cache

        # Build a prompt of `max_length` ids: a start token followed by padding.
        start = ops.full((batch_size, 1), asl_tokenizer.token_to_id("[START]"))
        pad = ops.full(
            (batch_size, max_length - 1), asl_tokenizer.token_to_id("[PAD]")
        )
        prompt = ops.concatenate((start, pad), axis=-1)

        generated_tokens = keras_hub.samplers.GreedySampler()(
            next_token_fn,
            prompt,
            stop_token_ids=[asl_tokenizer.token_to_id("[END]")],
            index=1,  # Start sampling after start token.
        )
        generated_sentences = asl_tokenizer.detokenize(generated_tokens)
    return generated_sentences

outputs = []

test_eng_texts = [pair[0] for pair in test_pairs]
# Translate up to 200 distinct test sentences. Sampling without replacement
# avoids decoding (and saving) the same sentence several times, which
# repeated `random.choice` calls do when the pool is about the same size.
num_to_translate = min(200, len(test_eng_texts))
for input_sentence in random.sample(test_eng_texts, k=num_to_translate):
    translated = decode_sequences([input_sentence])[0]
    # Strip the special tokens left in the detokenized text.
    for special_token in ("[PAD]", "[START]", "[END]"):
        translated = translated.replace(special_token, "")
    outputs.append([input_sentence, translated.strip()])

# Save the (input, translation) rows for manual inspection.
df = pd.DataFrame(outputs, columns=["input sentence", "translation"])
df.to_csv("/Users/adrianajimenez/Desktop/Downloads/REUAICT/Real-Code/2025-ASL-data/seq2seq_code/word_level/outputs50.csv", index=False)

In [27]:
# Unigram and bigram ROUGE, accumulated over the whole test split.
rouge_1 = keras_hub.metrics.RougeN(order=1)
rouge_2 = keras_hub.metrics.RougeN(order=2)

for input_sentence, reference_sentence in test_pairs:
    translated_sentence = decode_sequences([input_sentence])[0]
    # Remove the special tokens before comparing against the reference gloss.
    for special_token in ("[PAD]", "[START]", "[END]"):
        translated_sentence = translated_sentence.replace(special_token, "")
    translated_sentence = translated_sentence.strip()

    # Update both metrics with (reference, candidate).
    rouge_1(reference_sentence, translated_sentence)
    rouge_2(reference_sentence, translated_sentence)

print("ROUGE-1 Score: ", rouge_1.result())
print("ROUGE-2 Score: ", rouge_2.result())

ROUGE-1 Score:  {'precision': <tf.Tensor: shape=(), dtype=float32, numpy=0.42068034410476685>, 'recall': <tf.Tensor: shape=(), dtype=float32, numpy=0.32498180866241455>, 'f1_score': <tf.Tensor: shape=(), dtype=float32, numpy=0.34988173842430115>}
ROUGE-2 Score:  {'precision': <tf.Tensor: shape=(), dtype=float32, numpy=0.21741840243339539>, 'recall': <tf.Tensor: shape=(), dtype=float32, numpy=0.16280566155910492>, 'f1_score': <tf.Tensor: shape=(), dtype=float32, numpy=0.17630094289779663>}
