In [1]:
import keras_hub
import pathlib
import random

import keras
from keras import ops

import tensorflow.data as tf_data

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from pyparsing import Word, alphas, nums
import pyparsing as pp

In [3]:
# Handshape labels accepted by the gloss grammar (fed to pp.one_of below).
# Order is kept exactly as originally listed.
handshapelist = [
    # numerals
    "1", "3", "4", "5", "6", "7", "8", "9", "10", "25",
    # letters
    "A", "B", "C", "D", "E", "F", "G", "H-U", "I", "K", "L",
    "M", "N", "O", "R", "S", "T", "U", "V", "W", "X", "Y",
    # hyphenated letter / numeral combinations
    "C-L", "U-L", "B-L", "P-K", "Q-G", "L-X", "I-L-Y",
    "5-C", "5-C-L", "5-C-tt",
    # labels with a descriptive prefix or suffix
    "alt-M", "alt-N", "alt-P",
    "B-xd",
    "baby-O",
    "bent-1", "bent-B", "bent-B-L", "bent-horns",
    "bent-L", "bent-M", "bent-N", "bent-U",
    "cocked-8", "cocked-F", "cocked-S",
    "crvd-5", "crvd-B", "crvd-flat-B", "crvd-L",
    "crvd-sprd-B", "crvd-U", "crvd-V",
    "fanned-flat-O",
    "flat-B", "flat-C", "flat-F", "flat-G", "flat-O-2", "flat-O",
    "full-M",
    "horns",
    "loose-E",
    "O-2-horns",
    "open-8", "open-F",
    "sml-C-3",
    "tight-C-2", "tight-C",
    "X-over-thumb",
]

In [4]:
# Terminal parsers used as the alternatives of the gloss grammar.

# Multi-character keywords.
cl_prefix = pp.one_of(["CL", "DCL", "LCL", "SCL", "BCL", "BPCL", "PCL", "ICL"])  # classifier prefixes
ns_prefix = pp.Literal("ns")
fs_prefix = pp.Literal("fs")
lex_exceptions = pp.one_of(["part", "WHAT"])
aspect_text = pp.Literal("aspect")
index_core_ix = pp.Literal("IX")
other_index_core = pp.one_of(["POSS", "SELF"])
handshape = pp.one_of(handshapelist)
person = pp.one_of(["1p", "2p", "3p"])
arc = pp.Literal("arc")
loc = pp.Literal("loc")
pl = pp.Literal("pl")
omit_quote = pp.Literal("xx")

# Single-character punctuation / operators.
dash = pp.Literal("-")
compound = pp.Literal("+")
hashtag = pp.Literal("#")
choice = pp.Literal("/")
sym = pp.Literal(">")
par1 = pp.Literal("(")
par2 = pp.Literal(")")
contraction = pp.Literal("^")
colon = pp.Literal(":")
period = pp.Literal(".")

# Single-character fallbacks. These names intentionally stay `alphas` / `nums`
# because later cells reference them, but the character sets are read from
# `pp.alphas` / `pp.nums` rather than from the names being rebound. Reading
# the rebound names would, on a second run of this cell, feed the parser
# objects back into pp.Word instead of the character strings.
alphas = pp.Word(pp.alphas, max=1)
nums = pp.Word(pp.nums, max=1)

In [5]:
# Alternatives for one token, tried top to bottom; the first one that matches
# wins, so the single-character fallbacks must stay at the end.
_token_alternatives = [
    cl_prefix,         # classifiers like CL, DCL, etc.
    ns_prefix,         # non-specific ns
    fs_prefix,         # fingerspelling fs
    index_core_ix,     # IX
    other_index_core,  # POSS, SELF
    person,            # 1p, 2p, 3p
    lex_exceptions,    # part, WHAT
    aspect_text,       # aspect
    arc,               # arc
    loc,               # loc
    pl,                # pl
    handshape,         # handshapes like B, 1, 5, etc.
    compound,          # +
    hashtag,           # #
    choice,            # /
    sym,               # >
    contraction,       # ^
    colon,             # :
    dash,              # -
    par1,              # (
    par2,              # )
    omit_quote,        # xx
    period,            # .
    nums,              # numbers last resort
    alphas,            # fallback LAST
]

# A gloss string is one or more such tokens.
full_grammar = pp.OneOrMore(pp.MatchFirst(_token_alternatives))

In [6]:
# Smoke-test the grammar directly on two sample glosses.
# Both calls use the snake_case `as_list()` spelling, matching the rest of the
# notebook (the first call previously used the legacy camelCase `asList()`).
trial = full_grammar.parse_string("SCL:1xx", parse_all=True).as_list()
trial2 = full_grammar.parse_string(
    "IX-1p-pl-2 WORK LANDSCAPE fs-LANDSCAPING IX-1p 5xx", parse_all=True
).as_list()

print(trial)
print(trial2)

['SCL', ':', '1', 'xx']
['IX', '-', '1p', '-', 'pl', '-', '2', 'W', 'O', 'R', 'K', 'L', 'A', 'N', 'D', 'S', 'C', 'A', 'P', 'E', 'fs', '-', 'L', 'A', 'N', 'D', 'S', 'C', 'A', 'P', 'I', 'N', 'G', 'IX', '-', '1p', '5', 'xx']


In [7]:
def custom_tokenize(text):
    """Tokenize ``text`` with ``full_grammar``.

    Returns the matched tokens as a plain list. When the grammar cannot
    consume the whole string, the parse error is printed and an empty list
    is returned instead of raising.
    """
    try:
        parsed = full_grammar.parse_string(text, parse_all=True)
    except pp.ParseException as pe:
        print(f"Failed to parse: {pe}")
        return []
    return parsed.as_list()

In [8]:
# Same two sample glosses as before, this time through the wrapper function.
trial = custom_tokenize("SCL:1xx")
trial2 = custom_tokenize("IX-1p-pl-2 WORK LANDSCAPE fs-LANDSCAPING IX-1p 5xx")

# One token list per output line.
print(trial, trial2, sep="\n")

['SCL', ':', '1', 'xx']
['IX', '-', '1p', '-', 'pl', '-', '2', 'W', 'O', 'R', 'K', 'L', 'A', 'N', 'D', 'S', 'C', 'A', 'P', 'E', 'fs', '-', 'L', 'A', 'N', 'D', 'S', 'C', 'A', 'P', 'I', 'N', 'G', 'IX', '-', '1p', '5', 'xx']


In [9]:
# Notebook-wide settings, collected in one place so later cells only read them.

# Batch size / epoch count.
BATCH_SIZE = 64
EPOCHS = 30  # This should be at least 10 for convergence

# Sequence length and vocabulary size limits.
MAX_SEQUENCE_LENGTH = 50
ENG_VOCAB_SIZE = 15000
ASL_VOCAB_SIZE = 2000

# Model dimensions.
EMBED_DIM = 128
INTERMEDIATE_DIM = 1024
NUM_HEADS = 8

# Input data: file to read and the maximum number of lines taken from it.
data_path = "C:\\adriana\\official-code\\2025-ASL-data\\data\\use_data.txt"
num_samples = 1400

In [11]:
# Load up to `num_samples` tab-separated records from `data_path`.
# Each record is "<input text>\t<target text>"; the target side is tokenized
# with custom_tokenize, the input side is treated as individual characters.
input_texts = []
target_texts = []
token_list = []
input_tokens = set()
target_tokens = set()

with open(data_path, "r", encoding="utf-8") as f:
    # Drop blank lines up front. The previous `len(lines) - 1` slice assumed
    # exactly one trailing newline: without it the last record was silently
    # skipped, and any blank line inside the range crashed the unpack below.
    lines = [line for line in f.read().split("\n") if line.strip()]

for line in lines[:num_samples]:
    input_text, target_text = line.split("\t")
    # custom_tokenize returns [] when a target cannot be parsed; the empty
    # list is still appended so token_list stays aligned with input_texts.
    tokenized_sent = custom_tokenize(target_text)
    token_list.append(tokenized_sent)
    input_texts.append(input_text)
    target_texts.append(target_text)
    input_tokens.update(input_text)       # one entry per distinct character
    target_tokens.update(tokenized_sent)  # one entry per distinct grammar token

input_tokens = sorted(input_tokens)
target_tokens = sorted(target_tokens)

print("input_tokens:", input_tokens)
print("output_tokens", target_tokens)
num_encoder_tokens = len(input_tokens)
num_decoder_tokens = len(target_tokens)
print("num_eng_tokens", num_encoder_tokens)
print("num_asl_tokens", num_decoder_tokens)
print(token_list)

Failed to parse: Expected end of text, found '"'  (at char 72), (line:1, col:73)
Failed to parse: Expected end of text, found '"'  (at char 22), (line:1, col:23)
Failed to parse: Expected end of text, found '"'  (at char 32), (line:1, col:33)
Failed to parse: Expected end of text, found '"'  (at char 28), (line:1, col:29)
Failed to parse: Expected end of text, found '"'  (at char 41), (line:1, col:42)
input_tokens: [' ', '!', '"', '$', "'", '(', ')', '*', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
output_tokens ['#', '(', ')', '+', '-', '.', '/', '0', '1', '10', '1p', '2', '25', '2p', '3', '3p', '4', '5', '5-C', '5-C-L', '6', '7', '8', '9', ':', '>', 'A', 'B', 'B-L', 'BCL', 'BPCL', 'C', 'CL', '

In [12]:
# Re-join every token list into a single space-separated string per sentence.
parsed_sent = [" ".join(tokens) for tokens in token_list]

print(parsed_sent)

['IX - 1p IX - 1p R E A L L Y IX - 2p N O T I C E ( 5 ) xx # D O ( 5 ) xx', '5 xx R E A L L Y Y E S T E R D A Y IX - 1p W O R K W I T H F R I E N D IX - loc', 'fs - T O N Y IX - 1p - pl - 2 G R O W - U P POSS - 1p B E S T F R I E N D', 'IX - 1p - pl - 2 W O R K L A N D S C A P E fs - L A N D S C A P I N G IX - 1p 5 xx', 'IX - 1p - pl - 2 C U T fs - W O O D fs - C U T T E R S A M E DCL : b e n t - 5 xx W I T H M E T A L C O R N E R', 'C O R N E R R E A L L Y IX - 3p S H A R P ( Y ) W O W M E T A L W I T H B I G fs - L O G DCL : c u r v e d 5 xx', 'DCL : c u r v e d 5 xx R E A L L Y D I F F E R E N T S I Z E R E A L L Y DCL : 5 xx', 'R E A L L Y POSS - 1p B O S S P R E T T Y S T R I C T S A M E', 'S A Y 5 xx M U S T E N D # A L L B E F O R E S U N S E T', 'IX - 1p F I N E IX - 1p ICL : c u r v e d 5 xx', 'I M P A C T O N E B I G fs - L O G DCL : c u r v e d 5 xx IX - 1p BCL : c u r v e d 5 xx BCL : c u r v e d 5 xx', 'IX - 1p BCL : c u r v e d 5 xx G E T fs - I T # B A C K', 'B U T S T I

In [None]:
def normalize_and_split(x):
    """Split on whitespace.

    Thin wrapper that forwards ``x`` to ``tf.strings.split`` with its default
    separator and returns the result unchanged.
    """
    # The notebook only imports `tensorflow.data as tf_data`, so the bare
    # name `tf` is not otherwise in scope; import it here to avoid a NameError.
    import tensorflow as tf

    return tf.strings.split(x)

In [None]:
def train_word_piece(text_samples, vocab_size, reserved_tokens):
    """Compute a WordPiece vocabulary from ``text_samples``.

    The samples are wrapped in a ``tf_data.Dataset``, batched by 1000 with a
    prefetch of 2, and handed to
    ``keras_hub.tokenizers.compute_word_piece_vocabulary`` together with the
    requested ``vocab_size`` and ``reserved_tokens``. The computed vocabulary
    is returned as-is.
    """
    batched_samples = (
        tf_data.Dataset.from_tensor_slices(text_samples).batch(1000).prefetch(2)
    )
    return keras_hub.tokenizers.compute_word_piece_vocabulary(
        batched_samples,
        vocabulary_size=vocab_size,
        reserved_tokens=reserved_tokens,
    )

In [None]:
# Special tokens passed to the WordPiece vocabulary builder as reserved entries.
reserved_tokens = ["[PAD]", "[UNK]", "[START]", "[END]"]

# NOTE(review): `train_pairs` is not defined in any cell shown above this one;
# it must be built as (input, target) pairs before this cell can run.
eng_samples = [text_pair[0] for text_pair in train_pairs]
eng_vocab = train_word_piece(eng_samples, ENG_VOCAB_SIZE, reserved_tokens)

# The second element of each pair uses ASL_VOCAB_SIZE from the config cell.
# `SPA_VOCAB_SIZE` is never defined in this notebook and raised NameError.
# The `spa_*` variable names are kept so cells that read them keep working.
spa_samples = [text_pair[1] for text_pair in train_pairs]
spa_vocab = train_word_piece(spa_samples, ASL_VOCAB_SIZE, reserved_tokens)