In [1]:
import yaml
from transformers import AutoTokenizer

# Load the OLMoE tokenizer; BOS/EOS insertion is disabled so that encoded
# sequences contain only the tokens of the text itself.
tokenizer = AutoTokenizer.from_pretrained(
    'allenai/OLMoE-1B-7B-0924',
    add_eos_token=False,
    add_bos_token=False,
)

In [3]:
# Minimum number of times the token must appear, summed over all text samples
# of a meaning, for that meaning to be considered usable.
MIN_TOKEN_OCCURRENCES = 20

# Load the first token entry from the samples file.
with open("./../../data/contextual-tokens/samples_match.yaml", "r", encoding="utf-8") as f:
    token_info = yaml.safe_load(f)[0]

test_token = token_info['token']
test_meanings = token_info['meanings']

# Check that 'test_token' is recognized by the tokenizer as exactly one token
encoded_token = tokenizer(test_token, add_special_tokens = False)
token_ids = encoded_token["input_ids"]

if len(token_ids) != 1:
    # Really skip the occurrence check: indexing an empty encoding would raise
    # IndexError, and counting only the first piece of a multi-token encoding
    # would produce meaningless totals.
    print(f'Warning: {test_token} is not tokenized as a single token, skipping')
    test_token_id = None
    all_meanings_ok = False
else:
    test_token_id = token_ids[0]

    # For each meaning, sum how many occurrences of token_id appear across all text_samples
    all_meanings_ok = True
    for meaning_index, meaning in enumerate(test_meanings):
        total_occurrences = 0
        for sample_text in meaning['text_samples']:
            # Samples are truncated to 512 tokens, so occurrences past that
            # point are not counted.
            encoded_sample = tokenizer(sample_text, add_special_tokens = False, truncation = True, max_length = 512)
            total_occurrences += encoded_sample["input_ids"].count(test_token_id)
        if total_occurrences < MIN_TOKEN_OCCURRENCES:
            all_meanings_ok = False
            print(
                f'Error - meaning {meaning_index} has only {total_occurrences} '
                f'occurrences of the token (minimum {MIN_TOKEN_OCCURRENCES})'
            )

    # If any meaning fails the minimum occurrences, discard this file
    if not all_meanings_ok:
        print(f'Warning: Minimum token count not met for token {test_token}')

In [4]:
# Encode a small JavaScript snippet to inspect how the tokenizer splits it,
# in particular the different token ids produced for '{' and ' {'.
test_str = tokenizer.encode("function generateReport(data) {\n    let total = 0;{\n        console.log(\"High total: \" + total);\n    } else {\n        console.log(\"Normal total: \" + total);\n    }\n    return total;\n}\n// This is just a sample block that sums values using a { in JavaScript.")

# Print one "<id> | <decoded text>" line per token.
for token_id in test_str:
    print(f"{token_id} | {tokenizer.decode(token_id)}")

3701 | function
6635 |  generate
20684 | Report
9 | (
2203 | data
10 | )
551 |  {
187 | 

50274 |     
1059 | let
2264 |  total
426 |  =
470 |  0
28 | ;
92 | {
187 | 

50270 |         
22452 | console
15 | .
2808 | log
1587 | ("
12412 | High
2264 |  total
27 | :
346 |  "
559 |  +
2264 |  total
558 | );
187 | 

50274 |     
94 | }
2010 |  else
551 |  {
187 | 

50270 |         
22452 | console
15 | .
2808 | log
1587 | ("
20167 | Normal
2264 |  total
27 | :
346 |  "
559 |  +
2264 |  total
558 | );
187 | 

50274 |     
94 | }
187 | 

50274 |     
2309 | return
2264 |  total
28 | ;
187 | 

94 | }
187 | 

605 | //
831 |  This
310 |  is
816 |  just
247 |  a
3410 |  sample
2972 |  block
326 |  that
22661 |  sums
2193 |  values
970 |  using
247 |  a
551 |  {
275 |  in
16872 |  JavaScript
15 | .
