In [1]:
import keras_hub
import random

import keras
from keras import ops

import tensorflow.data as tf_data
import tensorflow as tf
from tensorflow_text.tools.wordpiece_vocab import (
    bert_vocab_from_dataset,
)
import pandas as pd
from keras_nlp.samplers import TopKSampler

import numpy as np
import os
from pathlib import Path

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# imports for pre-parsing
from pyparsing import Word, alphas as pp_alpha, nums as pp_nums
import pyparsing as pp
pp.ParserElement.enablePackrat()
import re

In [3]:
class DictTokenizer:
    def __init__(self, vocab, tokenizer_fn):
        self.token_to_id_map = vocab
        self.id_to_token_map = {i: t for t, i in vocab.items()}
        self.tokenizer_fn = tokenizer_fn

    def __call__(self, text_batch):
        return [
            [self.token_to_id_map.get(tok, self.token_to_id_map.get("[UNK]", 0)) 
             for tok in self.tokenizer_fn(text)]
            for text in text_batch
        ]

    def tokenize(self, text):
        return [self.token_to_id_map.get(tok, self.token_to_id_map.get("[UNK]", 0)) 
                for tok in self.tokenizer_fn(text)]

    def detokenize(self, token_ids):
        if isinstance(token_ids, tf.Tensor):
            token_ids = token_ids.numpy()
        elif isinstance(token_ids, tf.RaggedTensor):
            token_ids = token_ids.to_tensor().numpy()
        elif isinstance(token_ids, int):
            token_ids = [token_ids]

        return " ".join([self.id_to_token_map.get(int(tok_id), "[UNK]") for tok_id in token_ids])

    def token_to_id(self, token):
        return self.token_to_id_map.get(token, self.token_to_id_map.get("[UNK]", 0))


In [4]:
# regex rules

alpha_regexp = r"""
(?!((?:THUMB-)?(?:IX|POSS|SELF)))   # negative lookahead for blocked glosses
[A-Z]                               # must start with uppercase
(?:                                 # optional middle section
    (?:                             # non-capturing group for allowed connectors
        (?:[-/][A-Z])               # hyphen or slash must be followed by uppercase
      | (?:_[0-9])                  # underscore must be followed by digit
      | (?:\+(?:[A-Z#]|fs-))       # plus + (uppercase OR # OR the literal fs-)
      | [A-Z0-9]                    # regular letter/digit continuation
    )
)*                                  # repeatable
(?:\.)?                             # optional trailing period
"""

In [5]:
# conventions kept for parsing

cl_prefix = pp.one_of(["CL", "DCL", "LCL", "SCL", "BCL", "BPCL", "PCL", "ICL"])
fs_prefix = pp.Literal("fs-")
index_core_ix = pp.Literal("IX")
other_index_core = pp.one_of(["POSS", "SELF"])
hashtag = pp.Literal("#")
dash = pp.Literal("-")
contraction = pp.Literal("^")
period = pp.Literal(".")
alpha = pp.Word(pp_alpha, max=1)
num = pp.Word(pp_nums, max=1)
word = pp.Regex(alpha_regexp, flags=re.X)

In [6]:
# grammar rules

full_grammar = pp.OneOrMore(
    fs_prefix |               # fingerspelling fs
    word |
    cl_prefix |               # classifiers like CL, DCL, etc.
    index_core_ix |           # IX
    other_index_core |        # POSS, SELF
    hashtag |                 # #
    contraction |             # ^
    period |                  # .
    dash |
    num |
    alpha                     # fallback LAST
)

In [7]:
# tokenize based on predefined grammar rules

def custom_asl_tokenize(text):
    try:
        if "'" in text:
            text = text.replace("'", "")
        if "++" in text:
            text = text.replace("++", "+")
        return full_grammar.parse_string(text, parse_all=True).asList()
    except pp.ParseException as pe:
        print(text)
        print(f"Failed to parse: {pe}")
        return []

In [8]:
def custom_eng_tokenize(text):
    # Perserve punctuation and digits
    text = re.sub(r'([^\w\s]|\d)', r' \1 ', text)
    # Convert to lowercase
    text = text.lower()
    # Split on whitespace
    tokens = text.split()
    return tokens

In [9]:
# generate
    # 1) list of eng-asl sentence pairs
    # 2) set of unique english vocab
    # 3) set of unique asl vocab
data_path = "/Users/adrianajimenez/Desktop/Downloads/REUAICT/Real-Code/2025-ASL-data/sent_pairs_joined.txt"
    
text_pairs = []
eng_texts = []
asl_texts = []
SPECIAL_TOKENS = ["[PAD]", "[START]", "[END]", "[UNK]"]
eng_tokens = set(SPECIAL_TOKENS)
asl_tokens = set(SPECIAL_TOKENS)
max_length = 0

with open(data_path, "r", encoding="utf-8") as f:
    lines = f.read().split("\n")

for line in lines:
    pair = []
    eng_text, asl_text = line.split("\t")
    eng_texts.append(eng_text)
    asl_texts.append(asl_text)
    pair.append(eng_text.lower())
    pair.append(asl_text)
    text_pairs.append(pair)
    
for text in eng_texts:
    tokens = custom_eng_tokenize(text)
    length = len(tokens)
    if length > max_length:
        max_length = length
    for token in tokens:
        if token not in eng_tokens:
                eng_tokens.add(token)
            
for text in asl_texts:
    tokens = custom_asl_tokenize(text)
    length = len(tokens)
    if length > max_length:
        max_length = length
    for token in tokens:
        if token not in asl_tokens:
                asl_tokens.add(token)
                            
max_encoder_seq_length = max([len(txt) for txt in eng_texts])
max_decoder_seq_length = max([len(txt) for txt in asl_texts])

eng_tokens = sorted(list(eng_tokens))
asl_tokens = sorted(list(asl_tokens))

print("eng_tokens:", eng_tokens)
print("asl_tokens", asl_tokens)
num_encoder_tokens = len(eng_tokens)
num_decoder_tokens = len(asl_tokens)
print("num_eng_tokens", num_encoder_tokens)
print("num_asl_tokens", num_decoder_tokens)

asl_tokens ['#', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'A-LEVEL-ABOVE', 'A-LEVEL-BELOW', 'A-LOT', 'A-OK', 'A-WAYS', 'AA', 'AAA', 'ABANDON', 'ABC', 'ABOUT', 'ABOVE', 'ABOVE_3', 'ABUSE', 'AC', 'ACCEPT', 'ACCIDENT', 'ACCOMMODATE', 'ACROSS', 'ACT', 'ACT+AGENT', 'ACTION', 'ADD-TO', 'ADDICTED', 'ADMIT', 'ADULT-TALL', 'ADVANTAGE', 'ADVENTURE', 'ADVISE', 'ADVISE/INFLUENCE', 'ADVISE/INFLUENCE+AGENT', 'ADVISER', 'AFRAID', 'AFTER', 'AFTERNOON_3', 'AGAIN', 'AGAINST', 'AGE', 'AGE-SIX+HALF', 'AGE-THIRTEEN', 'AGE-TWENTY-ONE', 'AGE-TWENTY-ONE_2', 'AGENT', 'AGREE', 'AIR', 'AIRPLANE', 'ALARM', 'ALCOHOL', 'ALEC-BALDWIN', 'ALI', 'ALL', 'ALL-DAY', 'ALL-GONE', 'ALL-NIGHT', 'ALL-NIGHT_3', 'ALL-THE-WAY', 'ALL-YEARS-HS', 'ALLERGY', 'ALLOW', 'ALL_2', 'ALMOST', 'ALONE', 'ALRIGHT', 'ALSO', 'ALWAYS', 'AMERICAN-AIRLINES', 'AMONG', 'AMONG_2', 'AMY', 'ANALYZE', 'ANALYZE_2', 'ANALYZE_3', 'AND', 'ANGELA', 'ANGRY', 'ANIMAL', 'ANKLE', 'ANN', 'ANNOUNCE', 'ANSWER', 'ANSWER+AGENT', 'ANY', 'ANY+MOR

In [10]:
main_asl_glosses = set()
split_pattern = r"[\/\+\-]"
        
for token in asl_tokens:
    parts = re.split(split_pattern, token)
    if all(part.isalpha() and part.isupper() for part in parts):
        main_asl_glosses.add(token)

In [11]:
# model parameters / hyperparameters

BATCH_SIZE = 32
EPOCHS = 30
EMBED_DIM = 128
INTERMEDIATE_DIM = 512
NUM_HEADS = 4
MAX_SEQUENCE_LENGTH = max_length

In [12]:
# glimpse pairs

for _ in range(5):
    print(random.choice(text_pairs))

['the deaf experience has been shown to be really different. the deaf experience the dominance of sound around them.', 'DEAF EXPERIENCE SHOW DIFFERENT DEAF EXPERIENCE SHOW REALLY fs-HEGEMONY fs-OF NOISE SUPERIOR']
["mary bought her mother's car and gave it to john.", 'fs-MARY BUY POSS MOTHER CAR GIFT fs-JOHN']
['mother will buy a house.', 'MOTHER FUTURE BUY HOUSE']
['i have to admit that sf copies boston a little, but anyway...', 'CITY/COMMUNITY REALLY IX ADMIT IX COPY fs-BOSTON LITTLE-BIT BUT']
["i don't think he was planning to go, but if i see him, i'll have him call you.", 'IX NOT THINK IX PLAN GO IF SEE IX FUTURE INFORM CALL-BY-PHONE IX']


In [13]:
# split data

random.shuffle(text_pairs)
num_val_samples = int(0.15 * len(text_pairs))
num_train_samples = len(text_pairs) - 2 * num_val_samples
train_pairs = text_pairs[:num_train_samples]
val_pairs = text_pairs[num_train_samples : num_train_samples + num_val_samples]
test_pairs = text_pairs[num_train_samples + num_val_samples :]

print(f"{len(text_pairs)} total pairs")
print(f"{len(train_pairs)} training pairs")
print(f"{len(val_pairs)} validation pairs")
print(f"{len(test_pairs)} test pairs")

3389 total pairs
2373 training pairs
508 validation pairs
508 test pairs


In [14]:
eng_vocab = dict([(char, i) for i, char in enumerate(eng_tokens)])
asl_vocab = dict([(char, i) for i, char in enumerate(asl_tokens)])

eng_tokenizer = DictTokenizer(eng_vocab, tokenizer_fn=custom_eng_tokenize)
asl_tokenizer = DictTokenizer(asl_vocab, tokenizer_fn=custom_asl_tokenize)

print(eng_tokenizer)
print(asl_tokenizer)

<__main__.DictTokenizer object at 0x16abbf3d0>
<__main__.DictTokenizer object at 0x16abbf3a0>


In [15]:
eng_input_ex = text_pairs[0][0]
eng_tokens_ex = eng_tokenizer.tokenize(eng_input_ex)
print("English sentence: ", eng_input_ex)
print("Tokens: ", eng_tokens_ex)
print(
    "Recovered text after detokenizing: ",
    eng_tokenizer.detokenize(eng_tokens_ex),
)

print()

asl_input_ex = text_pairs[0][1]
asl_tokens_ex = asl_tokenizer.tokenize(asl_input_ex)
print("ASL sentence: ", asl_input_ex)
print("Tokens: ", asl_tokens_ex)
print(
    "Recovered text after detokenizing: ",
    asl_tokenizer.detokenize(asl_tokens_ex),
)

English sentence:  bob hates reading books.
Tokens:  [312, 1226, 2157, 318, 9]
Recovered text after detokenizing:  bob hates reading books .

ASL sentence:  fs-BOB IX VOMIT/HATE READ BOOK
Tokens:  [2117, 217, 950, 2017, 1503, 222]
Recovered text after detokenizing:  fs- BOB IX VOMIT/HATE READ BOOK


In [16]:
def preprocess_batch(eng, asl):
    eng_start_end_packer = keras_hub.layers.StartEndPacker(
        sequence_length=MAX_SEQUENCE_LENGTH,
        pad_value=eng_tokenizer.token_to_id("[PAD]"),
        dtype="int32"
    )
    eng = eng_start_end_packer(eng)

    asl_start_end_packer = keras_hub.layers.StartEndPacker(
        sequence_length=MAX_SEQUENCE_LENGTH + 1,
        start_value=asl_tokenizer.token_to_id("[START]"),
        end_value=asl_tokenizer.token_to_id("[END]"),
        pad_value=asl_tokenizer.token_to_id("[PAD]"),
        dtype="int32"
    )
    asl = asl_start_end_packer(asl)

    decoder_inputs = asl[:, :-1]
    decoder_outputs = asl[:, 1:]

    return {
        "encoder_inputs": eng,
        "decoder_inputs": decoder_inputs
    }, decoder_outputs


In [17]:
def make_dataset(pairs):
    
    eng_ids = [eng_tokenizer.tokenize(sent) for sent, _ in pairs]    
    asl_ids = [asl_tokenizer.tokenize(sent) for _, sent in pairs]

    # 🛠️ Force token type to int32
    eng_tensor = tf.ragged.constant(eng_ids, dtype=tf.int32)
    asl_tensor = tf.ragged.constant(asl_ids, dtype=tf.int32)
    
    dataset = tf_data.Dataset.from_tensor_slices((eng_tensor, asl_tensor))
    dataset = dataset.batch(BATCH_SIZE)
    dataset = dataset.map(preprocess_batch, num_parallel_calls=tf_data.AUTOTUNE)
    return dataset.shuffle(2048).prefetch(16).cache()

train_ds = make_dataset(train_pairs)
val_ds = make_dataset(val_pairs)
print(train_ds)

2025-07-17 12:36:32.929244: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M2
2025-07-17 12:36:32.929446: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 8.00 GB
2025-07-17 12:36:32.929454: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 2.67 GB
I0000 00:00:1752770192.929908 12217163 pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
I0000 00:00:1752770192.930333 12217163 pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


<CacheDataset element_spec=({'encoder_inputs': TensorSpec(shape=(None, 71), dtype=tf.int32, name=None), 'decoder_inputs': TensorSpec(shape=(None, 71), dtype=tf.int32, name=None)}, TensorSpec(shape=(None, 71), dtype=tf.int32, name=None))>


In [18]:
for inputs, targets in train_ds.take(1):
    print(f'inputs["encoder_inputs"].shape: {inputs["encoder_inputs"].shape}')
    print(f'inputs["decoder_inputs"].shape: {inputs["decoder_inputs"].shape}')
    print(f"targets.shape: {targets.shape}")

inputs["encoder_inputs"].shape: (32, 71)
inputs["decoder_inputs"].shape: (32, 71)
targets.shape: (32, 71)


2025-07-17 12:36:33.755218: W tensorflow/core/kernels/data/cache_dataset_ops.cc:916] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.
2025-07-17 12:36:33.756170: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [19]:
# Encoder
encoder_inputs = keras.Input(shape=(None,), name="encoder_inputs")

x = keras_hub.layers.TokenAndPositionEmbedding(
    vocabulary_size=num_encoder_tokens,
    sequence_length=MAX_SEQUENCE_LENGTH,
    embedding_dim=EMBED_DIM,
)(encoder_inputs)

encoder_outputs = keras_hub.layers.TransformerEncoder(
    intermediate_dim=INTERMEDIATE_DIM, num_heads=NUM_HEADS
)(inputs=x)
encoder = keras.Model(encoder_inputs, encoder_outputs)


# Decoder
decoder_inputs = keras.Input(shape=(None,), name="decoder_inputs")
encoded_seq_inputs = keras.Input(shape=(None, EMBED_DIM), name="decoder_state_inputs")

x = keras_hub.layers.TokenAndPositionEmbedding(
    vocabulary_size=num_decoder_tokens,
    sequence_length=MAX_SEQUENCE_LENGTH,
    embedding_dim=EMBED_DIM,
)(decoder_inputs)

x = keras_hub.layers.TransformerDecoder(
    intermediate_dim=INTERMEDIATE_DIM, num_heads=NUM_HEADS
)(decoder_sequence=x, encoder_sequence=encoded_seq_inputs)
x = keras.layers.Dropout(0.5)(x)
decoder_outputs = keras.layers.Dense(num_decoder_tokens, activation="softmax")(x)
decoder = keras.Model(
    [
        decoder_inputs,
        encoded_seq_inputs,
    ],
    decoder_outputs,
)
decoder_outputs = decoder([decoder_inputs, encoder_outputs])

transformer = keras.Model(
    [encoder_inputs, decoder_inputs],
    decoder_outputs,
    name="transformer",
)

In [20]:
transformer.summary()
transformer.compile(
    "rmsprop", loss="sparse_categorical_crossentropy", metrics=["accuracy"]
)
transformer.fit(train_ds, epochs=EPOCHS, validation_data=val_ds)

Epoch 1/30


2025-07-17 12:36:35.470491: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 139ms/step - accuracy: 0.8175 - loss: 2.4179 - val_accuracy: 0.8841 - val_loss: 0.7673
Epoch 2/30
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 117ms/step - accuracy: 0.8914 - loss: 0.7135 - val_accuracy: 0.8860 - val_loss: 0.7242
Epoch 3/30
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 133ms/step - accuracy: 0.8945 - loss: 0.6669 - val_accuracy: 0.8887 - val_loss: 0.7017
Epoch 4/30
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 137ms/step - accuracy: 0.8972 - loss: 0.6373 - val_accuracy: 0.8901 - val_loss: 0.6874
Epoch 5/30
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 122ms/step - accuracy: 0.8991 - loss: 0.6143 - val_accuracy: 0.8905 - val_loss: 0.6795
Epoch 6/30
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 129ms/step - accuracy: 0.8999 - loss: 0.5977 - val_accuracy: 0.8911 - val_loss: 0.6728
Epoch 7/30
[1m75/75[0m [32m━━━━━

<keras.src.callbacks.history.History at 0x16b887e50>

In [41]:
def decode_sequences(input_sentences):
    with tf.device('/CPU:0'):
        batch_size = 1

        # Tokenize the encoder input.
        encoder_input_tokens = ops.convert_to_tensor(eng_tokenizer(input_sentences))
        if len(encoder_input_tokens[0]) < MAX_SEQUENCE_LENGTH:
            pads = ops.full((1, MAX_SEQUENCE_LENGTH - len(encoder_input_tokens[0])), 0)
            encoder_input_tokens = ops.concatenate(
                [encoder_input_tokens, pads], 1
            )

        # Define a function that outputs the next token's probability given the
        # input sequence.
        def next(prompt, cache, index):
            logits = transformer([encoder_input_tokens, prompt])[:, index - 1, :]
            # Ignore hidden states for now; only needed for contrastive search.
            hidden_states = None
            return logits, hidden_states, cache

        # Build a prompt of length 40 with a start token and padding tokens.
        length = MAX_SEQUENCE_LENGTH
        start = ops.full((batch_size, 1), asl_tokenizer.token_to_id("[START]"))
        pad = ops.full((batch_size, length - 1), asl_tokenizer.token_to_id("[PAD]"))
        prompt = ops.concatenate((start, pad), axis=-1)

        generated_tokens = keras_hub.samplers.ContrastiveSampler(k=25, alpha=0.75)(
        next,
        prompt,
        stop_token_ids=[asl_tokenizer.token_to_id("[END]")],
        index=1,
    )
        
        generated_tokens = generated_tokens.numpy().tolist()[0]
        generated_sentences = asl_tokenizer.detokenize(generated_tokens)
        return generated_sentences

outputs = []

test_eng_texts = [pair[0] for pair in test_pairs]
for i in range(50):
    output_pairs = []
    input_sentence = random.choice(test_eng_texts)
    translated = decode_sequences([input_sentence])
    translated = (
        translated.replace("[PAD]", "")
        .replace("[START]", "")
        .replace("[END]", "")
        .strip()
    )
    output_pairs.append(input_sentence)
    output_pairs.append(translated)
    outputs.append(output_pairs)
    
df = pd.DataFrame(outputs, columns=["input sentence", "translation"])
df.to_csv("/Users/adrianajimenez/Desktop/Downloads/REUAICT/Real-Code/2025-ASL-data/seq2seq_code/word_level/k25.txt", index=False)

ValueError: `ContrastiveSampler` requires passing a `hidden_states`, butreceived `None`.

In [None]:
rouge_1 = keras_hub.metrics.RougeN(order=1)
rouge_2 = keras_hub.metrics.RougeN(order=2)

for test_pair in test_pairs:
    input_sentence = test_pair[0]
    reference_sentence = test_pair[1]

    translated_sentence = decode_sequences([input_sentence])
    translated_sentence = translated_sentence[0]
    translated_sentence = (
        translated_sentence.replace("[PAD]", "")
        .replace("[START]", "")
        .replace("[END]", "")
        .strip()
    )

    rouge_1(reference_sentence, translated_sentence)
    rouge_2(reference_sentence, translated_sentence)

print("ROUGE-1 Score: ", rouge_1.result())
print("ROUGE-2 Score: ", rouge_2.result())