In [1]:
import keras_hub
import random

import keras
from keras import ops

import tensorflow.data as tf_data
import tensorflow as tf
from tensorflow_text.tools.wordpiece_vocab import (
    bert_vocab_from_dataset,
)
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# imports for pre-parsing
from pyparsing import Word, alphas as pp_alpha, nums as pp_nums
import pyparsing as pp
pp.ParserElement.enablePackrat()
import re

In [None]:
# regex rules

alpha_regexp  = r"(?!((?:THUMB-)?(?:IX|POSS|SELF)))[A-Z](?:[A-Z/_'-]*[A-Z])?(?:\.)?"
lookahead_regexp = r"(?:(?![a-z])|(?=wg))"

word_all_regexp = r"""(?x)
    (?: %s )
    %s
""" % (alpha_regexp, lookahead_regexp)

In [4]:
# conventions kept for parsing

cl_prefix = pp.one_of(["CL", "DCL", "LCL", "SCL", "BCL", "BPCL", "PCL", "ICL"])
fs_prefix = pp.Literal("fs-")
index_core_ix = pp.Literal("IX")
other_index_core = pp.one_of(["POSS", "SELF"])
compound = pp.Literal("+")
hashtag = pp.Literal("#")
sym = pp.Literal(">")
par1 = pp.Literal("(")
par2 = pp.Literal(")")
dash = pp.Literal("-")
contraction = pp.Literal("^")
colon = pp.Literal(":")
omit_quote = pp.Literal("xx")
period = pp.Literal(".")
alpha = pp.Word(pp_alpha, max=1)
num = pp.Word(pp_nums, max=1)
word = pp.Regex(word_all_regexp, flags=re.X)

In [None]:
# grammar rules

full_grammar = pp.OneOrMore(
    cl_prefix |               # classifiers like CL, DCL, etc.
    fs_prefix |               # fingerspelling fs
    index_core_ix |           # IX
    other_index_core |        # POSS, SELF
    word |
    compound |                # +
    hashtag |                 # #
    sym |                     # >
    contraction |             # ^
    colon |                   # :
    par1 | par2 |             # ( and )
    omit_quote |              # xx
    period |                  # .
    dash |
    num |
    alpha                     # fallback LAST
)

In [6]:
# testing grammar parsing

trial = full_grammar.parse_string("SCL:1xx", parse_all=True).asList()
trial2 = full_grammar.parse_string("IX-1p BCLxx FIND/FIND-OUT fs-HER", parse_all=True).asList()

print(trial)
print(trial2)

['SCL', ':', '1', 'xx']
['IX', '-', '1', 'p', 'BCL', 'xx', 'FIND/FIND-OUT', 'fs-', 'HER']


In [7]:
# tokenize based on predefined grammar rules

def custom_asl_tokenize(text):
    try:
        return full_grammar.parse_string(text, parse_all=True).asList()
    except pp.ParseException as pe:
        print(f"Failed to parse: {pe}")
        return []

In [8]:
def custom_eng_tokenize(text):
    # Perserve punctuation and digits
    text = re.sub(r'([^\w\s]|\d)', r' \1 ', text)
    # Convert to lowercase
    text = text.lower()
    # Split on whitespace
    tokens = text.split()
    return tokens

In [9]:
# testing custom_asl_tokenize

trial = custom_asl_tokenize("SCL:1xx")
trial2 = custom_asl_tokenize('SCL:1xx SHOWER WASH FEEL THUMBS-UP/GOOD')

print(trial)
print(trial2)

['SCL', ':', '1', 'xx']
['SCL', ':', '1', 'xx', 'SHOWER', 'WASH', 'FEEL', 'THUMBS-UP/GOOD']


In [10]:
# testing custom_eng_tokenize

trial = custom_eng_tokenize("But all the same, he told me I better go downstairs and get an x-ray.")
trial2 = custom_eng_tokenize('I waited 234 years and 2,142 days')

print(trial)
print(trial2)

['but', 'all', 'the', 'same', ',', 'he', 'told', 'me', 'i', 'better', 'go', 'downstairs', 'and', 'get', 'an', 'x', '-', 'ray', '.']
['i', 'waited', '2', '3', '4', 'years', 'and', '2', ',', '1', '4', '2', 'days']


In [11]:
# model parameters / hyperparameters

BATCH_SIZE = 16
EPOCHS = 20
EMBED_DIM = 128
INTERMEDIATE_DIM = 512
NUM_HEADS = 4

MAX_SEQUENCE_LENGTH = 150
ENG_VOCAB_SIZE = 3092 + 4
ASL_VOCAB_SIZE = 1810 + 4
num_samples = 3380

data_path = "/Users/adrianajimenez/Desktop/Downloads/REUAICT/Real-Code/2025-ASL-data/sent_pairs_joined.txt"

In [12]:
def isolate_cl(text):
    cl_pre = ("CL", "DCL", "LCL", "SCL", "BCL", "BPCL", "PCL", "ICL")
    if text.startswith(cl_pre) and ":" in text:
        index = text.index(":")
        text = text[:index]
    return text

In [13]:
# generate
    # 1) list of eng-asl sentence pairs
    # 2) set of unique english vocab
    # 3) set of unique asl vocab

text_pairs = []
eng_tokens = set()
asl_tokens = set()

with open(data_path, "r", encoding="utf-8") as f:
    lines = f.read().split("\n")

for line in lines:
    pair = []
    eng_text, asl_text = line.split("\t")
    glosses = asl_text.split()
    revised_glosses = [isolate_cl(gloss) for gloss in glosses]
    asl_text = " ".join(revised_glosses)
    pair.append(eng_text.lower())
    pair.append(asl_text)
    text_pairs.append(pair)

for pair in text_pairs:
    sent_tokens = custom_eng_tokenize(pair[0])
    for token in sent_tokens:
        if token not in eng_tokens:
            eng_tokens.add(token)
            
for pair in text_pairs:
    sent_tokens = custom_asl_tokenize(pair[1])
    for token in sent_tokens:
        if token not in asl_tokens:
            asl_tokens.add(token)

eng_tokens = sorted(list(eng_tokens))
asl_tokens = sorted(list(asl_tokens))

print("eng_tokens:", eng_tokens)
print("asl_tokens", asl_tokens)
num_encoder_tokens = len(eng_tokens)
num_decoder_tokens = len(asl_tokens)
print("num_eng_tokens", num_encoder_tokens)
print("num_asl_tokens", num_decoder_tokens)

Failed to parse: Expected end of text, found '_'  (at char 4), (line:1, col:5)
Failed to parse: Expected end of text, found '_'  (at char 47), (line:1, col:48)
Failed to parse: Expected end of text, found '_'  (at char 37), (line:1, col:38)
Failed to parse: Expected end of text, found '_'  (at char 83), (line:1, col:84)
Failed to parse: Expected end of text, found '_'  (at char 22), (line:1, col:23)
Failed to parse: Expected end of text, found '_'  (at char 15), (line:1, col:16)
Failed to parse: Expected end of text, found '_'  (at char 37), (line:1, col:38)
Failed to parse: Expected end of text, found '_'  (at char 17), (line:1, col:18)
Failed to parse: Expected end of text, found '_'  (at char 72), (line:1, col:73)
Failed to parse: Expected end of text, found '_'  (at char 22), (line:1, col:23)
Failed to parse: Expected end of text, found '_'  (at char 47), (line:1, col:48)
Failed to parse: Expected end of text, found '_'  (at char 119), (line:1, col:120)
Failed to parse: Expected en

In [14]:
# glimpse pairs

for _ in range(5):
    print(random.choice(text_pairs))

['really, all the way through nebraska...', 'REALLY THROUGH fs-NEB']
['john wants to sell his car in the future.', 'fs-JOHN WANT SELL CAR FUTURE']
['friends go to the beach to sunbathe because they are pale.', 'FRIEND GROUP/TOGETHER-pl fs-BEACH SUNBATHE DCL SUNBATHE (25)WHY IX fs-PALE']
['mother walked to the store.', 'IX MOTHER IX SCL GO-OUT SELL']
['if the president is elected again, my father will be upset.', 'IF PRESIDENT VOTE AGAIN POSS FATHER fs-UPSET FUTURE']


In [15]:
# split data

random.shuffle(text_pairs)
num_val_samples = int(0.15 * len(text_pairs))
num_train_samples = len(text_pairs) - 2 * num_val_samples
train_pairs = text_pairs[:num_train_samples]
val_pairs = text_pairs[num_train_samples : num_train_samples + num_val_samples]
test_pairs = text_pairs[num_train_samples + num_val_samples :]

print(f"{len(text_pairs)} total pairs")
print(f"{len(train_pairs)} training pairs")
print(f"{len(val_pairs)} validation pairs")
print(f"{len(test_pairs)} test pairs")

3389 total pairs
2373 training pairs
508 validation pairs
508 test pairs


In [16]:
def train_word_piece(text_samples, vocab_size, reserved_tokens):
    word_piece_ds = tf_data.Dataset.from_tensor_slices(text_samples)
    vocab = keras_hub.tokenizers.compute_word_piece_vocabulary(
        word_piece_ds.batch(500).prefetch(2),
        vocabulary_size=vocab_size,
        reserved_tokens=reserved_tokens,
    )
    return vocab

In [17]:
reserved_tokens = ["[PAD]", "[UNK]", "[START]", "[END]"]

eng_samples = [text_pair[0] for text_pair in train_pairs]
trained_eng_vocab = train_word_piece(eng_samples, ENG_VOCAB_SIZE, reserved_tokens)

asl_samples = [text_pair[1] for text_pair in train_pairs]
trained_asl_vocab = train_word_piece(asl_samples, ASL_VOCAB_SIZE, reserved_tokens)

print(trained_eng_vocab)
print(trained_asl_vocab)

2025-07-16 13:16:29.994443: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M2
2025-07-16 13:16:29.995384: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 8.00 GB
2025-07-16 13:16:29.996972: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 2.67 GB
I0000 00:00:1752686190.003917 11484742 pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
I0000 00:00:1752686190.007227 11484742 pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
2025-07-16 13:16:30.826088: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2025-07-16 13:16:33.626015: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting wi

['[PAD]', '[UNK]', '[START]', '[END]', '!', '"', '$', "'", '(', ')', '*', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '—', '“', '”', '…', 'the', 'to', 'and', '##s', 'it', 'is', 'he', 'my', 'that', 'was', 'you', 'in', 'john', '##ing', 'of', 'if', 'will', '##ed', 'for', 'have', 'when', 'did', 'they', 'car', '##d', 'there', 'at', 'on', 'because', 'who', 'we', '##y', 'no', 'but', 'out', 'friend', 'book', 'deaf', 'not', 'with', '##t', 'go', 'are', 'be', 'she', 'his', 'what', 'buy', '##n', 'like', 'mother', 'mary', '##e', '##er', 'people', 'up', 'so', 'do', 'me', '##a', 'had', 'know', 'teacher', 'as', 'friends', 'house', 'has', 'really', 'would', 'said', '##ly', 'can', 'person', 'don', '##k', 'all', 'going', 'her', 'went', 'father', '##r', 'him', 'didn', 'how', 'one', 'then', 'many', '##l', 'good', 'their', 'time', '##

In [18]:
print("English Tokens: ", trained_eng_vocab[100:110])
print("ASL Tokens: ", trained_asl_vocab[100:110])

English Tokens:  ['go', 'are', 'be', 'she', 'his', 'what', 'buy', '##n', 'like', 'mother']
ASL Tokens:  ['FATHER', 'UP', 'MUST', 'FOR', 'NOW', '25', 'TO', 'READ', 'ICL', 'NIGHT']


In [19]:
eng_tokenizer = keras_hub.tokenizers.WordPieceTokenizer(
    vocabulary=trained_eng_vocab, lowercase=False
)
asl_tokenizer = keras_hub.tokenizers.WordPieceTokenizer(
    vocabulary=trained_asl_vocab, lowercase=False 
)

In [20]:
eng_input_ex = text_pairs[0][0]
eng_tokens_ex = eng_tokenizer.tokenize(eng_input_ex)
print("English sentence: ", eng_input_ex)
print("Tokens: ", eng_tokens_ex)
print(
    "Recovered text after detokenizing: ",
    eng_tokenizer.detokenize(eng_tokens_ex),
)

print()

asl_input_ex = text_pairs[0][1]
asl_tokens_ex = asl_tokenizer.tokenize(asl_input_ex)
print("ASL sentence: ", asl_input_ex)
print("Tokens: ", asl_tokens_ex)
print(
    "Recovered text after detokenizing: ",
    asl_tokenizer.detokenize(asl_tokens_ex),
)

English sentence:  the teacher didn't make the reading required. it's uncertain whether or not john will read the book now.
Tokens:  tf.Tensor(
[ 59 121 141   7  48 198  59 177 472 664 665 296  83  13  63   7  47  49
 107 226 139  99 567  51 167 172 424 152  97  71  75 153  59  95 162  13], shape=(36,), dtype=int32)
Recovered text after detokenizing:  the teacher didn ' t make the reading required . it ' s uncertain whether or not john will read the book now .

ASL sentence:  TEACH+AGENT TEACH+AGENT NOT REQUIRE BOOK READ DROP IX fs-JOHN FUTURE READ BOOK
Tokens:  tf.Tensor(
[130   8 111 130   8 111  74  40  98 622 623 287  90 107  26 191 332  72
  73   9  77  83 107  90], shape=(24,), dtype=int32)
Recovered text after detokenizing:  TEACH + AGENT TEACH + AGENT NOT REQUIRE BOOK READ DROP IX fs - JOHN FUTURE READ BOOK


In [21]:
def preprocess_batch(eng, asl):
    eng = eng_tokenizer(eng)
    asl = asl_tokenizer(asl)

    # Pad `eng` to `MAX_SEQUENCE_LENGTH`.
    eng_start_end_packer = keras_hub.layers.StartEndPacker(
        sequence_length=MAX_SEQUENCE_LENGTH,
        pad_value=eng_tokenizer.token_to_id("[PAD]"),
    )
    eng = eng_start_end_packer(eng)

    # Add special tokens (`"[START]"` and `"[END]"`) to `asl` and pad it as well.
    asl_start_end_packer = keras_hub.layers.StartEndPacker(
        sequence_length=MAX_SEQUENCE_LENGTH + 1,
        start_value=asl_tokenizer.token_to_id("[START]"),
        end_value=asl_tokenizer.token_to_id("[END]"),
        pad_value=asl_tokenizer.token_to_id("[PAD]"),
    )
    asl = asl_start_end_packer(asl)

    return (
        {
            "encoder_inputs": eng,
            "decoder_inputs": asl[:, :-1],
        },
        asl[:, 1:],
    )


def make_dataset(pairs):
    eng_texts, asl_texts = zip(*pairs)
    eng_texts = list(eng_texts)
    asl_texts = list(asl_texts)
    dataset = tf_data.Dataset.from_tensor_slices((eng_texts, asl_texts))
    dataset = dataset.batch(BATCH_SIZE)
    dataset = dataset.map(preprocess_batch, num_parallel_calls=tf_data.AUTOTUNE)
    return dataset.shuffle(1400).prefetch(16).cache()


train_ds = make_dataset(train_pairs)
val_ds = make_dataset(val_pairs)

In [22]:
for inputs, targets in train_ds.take(1):
    print(f'inputs["encoder_inputs"].shape: {inputs["encoder_inputs"].shape}')
    print(f'inputs["decoder_inputs"].shape: {inputs["decoder_inputs"].shape}')
    print(f"targets.shape: {targets.shape}")

inputs["encoder_inputs"].shape: (16, 150)
inputs["decoder_inputs"].shape: (16, 150)
targets.shape: (16, 150)


2025-07-16 13:16:36.457956: W tensorflow/core/kernels/data/cache_dataset_ops.cc:916] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


In [23]:
# Encoder
encoder_inputs = keras.Input(shape=(None,), name="encoder_inputs")

x = keras_hub.layers.TokenAndPositionEmbedding(
    vocabulary_size=ENG_VOCAB_SIZE,
    sequence_length=MAX_SEQUENCE_LENGTH,
    embedding_dim=EMBED_DIM,
)(encoder_inputs)

encoder_outputs = keras_hub.layers.TransformerEncoder(
    intermediate_dim=INTERMEDIATE_DIM, num_heads=NUM_HEADS
)(inputs=x)
encoder = keras.Model(encoder_inputs, encoder_outputs)


# Decoder
decoder_inputs = keras.Input(shape=(None,), name="decoder_inputs")
encoded_seq_inputs = keras.Input(shape=(None, EMBED_DIM), name="decoder_state_inputs")

x = keras_hub.layers.TokenAndPositionEmbedding(
    vocabulary_size=ASL_VOCAB_SIZE,
    sequence_length=MAX_SEQUENCE_LENGTH,
    embedding_dim=EMBED_DIM,
)(decoder_inputs)

x = keras_hub.layers.TransformerDecoder(
    intermediate_dim=INTERMEDIATE_DIM, num_heads=NUM_HEADS
)(decoder_sequence=x, encoder_sequence=encoded_seq_inputs)
x = keras.layers.Dropout(0.5)(x)
decoder_outputs = keras.layers.Dense(ASL_VOCAB_SIZE, activation="softmax")(x)
decoder = keras.Model(
    [
        decoder_inputs,
        encoded_seq_inputs,
    ],
    decoder_outputs,
)
decoder_outputs = decoder([decoder_inputs, encoder_outputs])

transformer = keras.Model(
    [encoder_inputs, decoder_inputs],
    decoder_outputs,
    name="transformer",
)

In [24]:
transformer.summary()
transformer.compile(
    "rmsprop", loss="sparse_categorical_crossentropy", metrics=["accuracy"]
)
transformer.fit(train_ds, epochs=EPOCHS, validation_data=val_ds)

Epoch 1/20


2025-07-16 13:16:38.225538: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


[1m149/149[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 156ms/step - accuracy: 0.8594 - loss: 1.6322 - val_accuracy: 0.9102 - val_loss: 0.5263
Epoch 2/20
[1m149/149[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 160ms/step - accuracy: 0.9082 - loss: 0.5348 - val_accuracy: 0.9129 - val_loss: 0.4736
Epoch 3/20
[1m149/149[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 180ms/step - accuracy: 0.9110 - loss: 0.4885 - val_accuracy: 0.9148 - val_loss: 0.4506
Epoch 4/20
[1m149/149[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 237ms/step - accuracy: 0.9121 - loss: 0.4674 - val_accuracy: 0.9154 - val_loss: 0.4378
Epoch 5/20
[1m149/149[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 212ms/step - accuracy: 0.9128 - loss: 0.4534 - val_accuracy: 0.9161 - val_loss: 0.4309
Epoch 6/20
[1m149/149[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 192ms/step - accuracy: 0.9135 - loss: 0.4430 - val_accuracy: 0.9164 - val_loss: 0.4240
Epoch 7/20
[1m149/14

<keras.src.callbacks.history.History at 0x3098bedd0>

In [25]:
def decode_sequences(input_sentences):
    with tf.device('/CPU:0'):
        batch_size = 1

        # Tokenize the encoder input.
        encoder_input_tokens = ops.convert_to_tensor(eng_tokenizer(input_sentences))
        if len(encoder_input_tokens[0]) < MAX_SEQUENCE_LENGTH:
            pads = ops.full((1, MAX_SEQUENCE_LENGTH - len(encoder_input_tokens[0])), 0)
            encoder_input_tokens = ops.concatenate(
                [encoder_input_tokens, pads], 1
            )

        # Define a function that outputs the next token's probability given the
        # input sequence.
        def next(prompt, cache, index):
            logits = transformer([encoder_input_tokens, prompt])[:, index - 1, :]
            # Ignore hidden states for now; only needed for contrastive search.
            hidden_states = None
            return logits, hidden_states, cache

        # Build a prompt of length 40 with a start token and padding tokens.
        length = 40
        start = ops.full((batch_size, 1), asl_tokenizer.token_to_id("[START]"))
        pad = ops.full((batch_size, length - 1), asl_tokenizer.token_to_id("[PAD]"))
        prompt = ops.concatenate((start, pad), axis=-1)


        generated_tokens = keras_hub.samplers.GreedySampler()(
                next,
                prompt,
                stop_token_ids=[asl_tokenizer.token_to_id("[END]")],
                index=1,  # Start sampling after start token.
            )
        generated_sentences = asl_tokenizer.detokenize(generated_tokens)
    return generated_sentences

outputs = []

test_eng_texts = [pair[0] for pair in test_pairs]
for i in range(200):
    output_pairs = []
    input_sentence = random.choice(test_eng_texts)
    translated = decode_sequences([input_sentence])
    translated = translated[0]
    translated = (
        translated.replace("[PAD]", "")
        .replace("[START]", "")
        .replace("[END]", "")
        .strip()
    )
    output_pairs.append(input_sentence)
    output_pairs.append(translated)
    outputs.append(output_pairs)
    
df = pd.DataFrame(outputs, columns=["input sentence", "translation"])
df.to_csv("/Users/adrianajimenez/Desktop/Downloads/REUAICT/Real-Code/2025-ASL-data/seq2seq_code/word_level/joined_outputs1.csv", index=False)

I0000 00:00:1752686745.973149 11484742 service.cc:152] XLA service 0x387911870 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1752686745.973548 11484742 service.cc:160]   StreamExecutor device (0): Host, Default Version
I0000 00:00:1752686746.327616 11484742 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


In [26]:
rouge_1 = keras_hub.metrics.RougeN(order=1)
rouge_2 = keras_hub.metrics.RougeN(order=2)

for test_pair in test_pairs:
    input_sentence = test_pair[0]
    reference_sentence = test_pair[1]

    translated_sentence = decode_sequences([input_sentence])
    translated_sentence = translated_sentence[0]
    translated_sentence = (
        translated_sentence.replace("[PAD]", "")
        .replace("[START]", "")
        .replace("[END]", "")
        .strip()
    )

    rouge_1(reference_sentence, translated_sentence)
    rouge_2(reference_sentence, translated_sentence)

print("ROUGE-1 Score: ", rouge_1.result())
print("ROUGE-2 Score: ", rouge_2.result())

KeyboardInterrupt: 