In [1]:
import yaml
from transformers import AutoTokenizer

# Load the OLMoE tokenizer with automatic BOS/EOS insertion turned off.
tokenizer = AutoTokenizer.from_pretrained(
    'allenai/OLMoE-1B-7B-0924',
    add_eos_token=False,
    add_bos_token=False,
)

In [23]:
# Validate one contextual-token entry: the token must map to exactly one
# tokenizer id, and every meaning must contain that id often enough across
# its text samples.

# Minimum number of occurrences of the token id required per meaning.
MIN_OCCURRENCES = 40
# Samples are truncated to this many tokens before counting, so occurrences
# beyond the limit are not counted.
MAX_SAMPLE_TOKENS = 512

with open("./../../datasets/contextual-tokens/samples_build.yaml", "r", encoding="utf-8") as f:
    # Only the first entry of the YAML document is checked here.
    token_info = yaml.safe_load(f)[0]

test_token = token_info['token']
test_meanings = token_info['meanings']

# Check that 'test_token' is recognized by the tokenizer as exactly one token
encoded_token = tokenizer(test_token, add_special_tokens=False)
token_ids = encoded_token["input_ids"]

if len(token_ids) != 1:
    # Actually skip: indexing [0] on an empty encoding would raise IndexError,
    # and counting only the first id of a multi-token encoding would report
    # occurrences of a sub-token rather than of the token itself.
    print(f'Warning: {test_token} is not tokenized as a single token, skipping')
    test_token_id = None
    all_meanings_ok = False
else:
    test_token_id = token_ids[0]

    # For each meaning, sum how many occurrences of token_id appear across all text_samples
    all_meanings_ok = True
    for meaning_index, meaning in enumerate(test_meanings):
        total_occurrences = 0
        for sample_text in meaning['text_samples']:
            encoded_sample = tokenizer(
                sample_text,
                add_special_tokens=False,
                truncation=True,
                max_length=MAX_SAMPLE_TOKENS,
            )
            total_occurrences += encoded_sample["input_ids"].count(test_token_id)

        if total_occurrences < MIN_OCCURRENCES:
            all_meanings_ok = False
            # Report which meaning failed and by how much; the threshold is on
            # token occurrences, not on the number of meanings.
            print(
                f'Error - meaning {meaning_index} has only {total_occurrences} '
                f'occurrences of the token (minimum {MIN_OCCURRENCES})'
            )
        else:
            print(f'Total count {total_occurrences}')

    # If any meaning fails the minimum occurrences, flag this token as unusable
    if not all_meanings_ok:
        print(f'Warning: Minimum token count not met for token {test_token}')

Total count 139
Total count 155
Total count 162


In [21]:
# Encode a short probe string, then list every token id next to its decoded text.
test_str = tokenizer.encode("hello ?")

for token_id in test_str:
    decoded_piece = tokenizer.decode(token_id)
    print(f"{token_id} | {decoded_piece}")

25521 | hello
3736 | ?


In [None]:
# NOTE(review): unexecuted scratch cell. `encode` is called with no arguments;
# the tokenizer API presumably requires the text to encode, so running this
# as-is is expected to raise a TypeError — supply an input or delete the cell.
tokenizer.encode()