In [1]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("GSAI-ML/LLaDA-8B-Instruct", trust_remote_code=True)
print("Tokenizer vocab size:", len(tokenizer.get_vocab()))


  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


Tokenizer vocab size: 126349


In [20]:
#Comparing LLaDA Vocab with NTLK
from transformers import AutoTokenizer
import nltk
from nltk.corpus import words
import re

# Download the dictionary
nltk.download('words')
english_words = set(words.words())

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("GSAI-ML/LLaDA-8B-Instruct", trust_remote_code=True)

# Access vocabulary
vocab = tokenizer.get_vocab()
tokens = list(vocab.keys())
total_tokens = len(tokens)

# Define a basic regex for word-like tokens (no special chars or digits)
def is_potential_word(token):
    clean = re.sub(r"[^a-zA-Z]", "", token)
    return clean.lower() in english_words

# Apply the check
word_tokens = [t for t in tokens if is_potential_word(t)]
num_word_tokens = len(word_tokens)
percent = 100 * num_word_tokens / total_tokens

print(f"Total tokens in vocab: {total_tokens}")
print(f"Tokens that match real English words: {num_word_tokens}")
print(f"Percentage: {percent:.2f}%")


[nltk_data] Downloading package words to /home/ayf4/nltk_data...
[nltk_data]   Package words is already up-to-date!


Total tokens in vocab: 126349
Tokens that match real English words: 36342
Percentage: 28.76%


In [27]:
# Compare LLaDA Vocab with NLTK Accounting Strongly For Ġ
from transformers import AutoTokenizer
import nltk
from nltk.corpus import words
import re

# Download English word list if not already available
nltk.download('words')
english_words = set(words.words())  # normalize all to lowercase

# Load the LLaDA tokenizer
tokenizer = AutoTokenizer.from_pretrained("GSAI-ML/LLaDA-8B-Instruct", trust_remote_code=True)

# Access raw token list
vocab = tokenizer.get_vocab()
tokens = list(vocab.keys())
total_tokens = len(tokens)

# Normalize tokens: strip leading Ġ and lowercase
tokens_cleaned = [t.lstrip("Ġ").lower() for t in tokens]

# Define match logic: full alphabetic word and in English dictionary
def is_potential_word(token):
    return token.isalpha() and token in english_words

def is_potential_word(token):
    clean = re.sub(r"[^a-zA-Z]", "", token)
    return clean.lower() in english_words

# Apply filter
word_tokens = [t for t in tokens_cleaned if is_potential_word(t)]
num_word_tokens = len(word_tokens)
percent = 100 * num_word_tokens / total_tokens

# Output results
print(f"Total tokens in LLaDA vocab: {total_tokens}")
print(f"Tokens matching English words: {num_word_tokens}")
print(f"Percentage English words: {percent:.2f}%")

# Sample matching tokens (optional)
import random
print("\nSample of matching English words:")
print(random.sample(word_tokens, 30))


[nltk_data] Downloading package words to /home/ayf4/nltk_data...
[nltk_data]   Package words is already up-to-date!


['meta', 'rider', 'åīļä¸º', 'twenty', 'åģłåĭ©', 'çi̇ħåħ³', 'eward', 'ä¹łæģ§', 'coe', 'é¢ģå¸ĥçļħ', 'å¤§ä½¿é¦ĩ', 'ç»¼åĳīå®ŀåĭľ', 'çľĭæ¸ħæ¥ļ', 'åıįé¦ī', 'indicted', 'ç¬¬åħ«ç«ł', 'filthy', '.**', 'å»·', 'capsule', 'é©±åĭ¨', '.check', 'åi̇łæĸĩåľ°åŀģ', 'sport', 'raising', 'åį°æľī', 'requent', 'projects', 'å¯¹æīĭ', 'æ·±å±±', 'ï¼įåĳ¸å¼ķäºĩ', 'æĭ½çĥł', 'å¤įåħ´', 'busters', 'peat', 'åħ³èģķæĸ¹', 'ted', 'è£ħ', '.shutdown', '-content', 'viewer', 'åı¯ä»¥ç»ļ', 'hdmi', 'jak', 'good', 'ç©¿è¡£æľį', 'captured', 'solving', 'toadd', 'åĳ½è¿ĳçļħ', 'complicated', 'æ¶¦æ»ĳåīĥ', 'ategories', 'pagination', 'ĉą', 'æĥĭåĸľ', 'çľĭçŀģå¥¹', 'æĭĳåī¶åīĥ', 'åīļå¤©', 'where', '-muslim', 'miserable', 'é¦ĭ', 'ä»£è¡¨æģ§çļħ', 'æĵĵè°i̇', '_scene', '.serializable', '+xml', 'parsons', 'inf', 'è®¢ç«ĭ', 'butt', 'gbp', 'vã¤', '(container', '_put', 'work', '±', 'å¿µå¿µ', 'å·¥ä½ľå²ĺä½į', 'åı¯è¢«', 'éľªçļħ', 'ãģģåĳīçĳĩ', 'fca', 'captain', 'nause', 'åºĳå±±', 'çļħä¸»è§ĵ', 'èģªæĺi̇', 'symbols', 'ãģģæķ¿æ²»', 'ï¼įè¯¢éĺ®', 'most', 'æ¿ģçĥī', 

In [29]:
#Categorizing Remaining LLaDA Tokens
import re

non_word_tokens = [t for t in tokens_cleaned if t not in word_tokens]

# Categories to track
categories = {
    'punctuation': [],
    'digits': [],
    'code_like': [],
    'subwords': [],
    'foreign_like': [],
    'symbols': [],
    'unknown': [],
}

# Regex helpers
is_digit = lambda t: re.fullmatch(r'\d+', t)
is_punct = lambda t: re.fullmatch(r'\W+', t)
is_code = lambda t: bool(re.search(r'[_<>/{};=]', t))
is_symbol = lambda t: bool(re.fullmatch(r'[^\w\s]+', t))
looks_foreign = lambda t: bool(re.search(r'[éöçñžあ語你]', t))  # crude but effective
looks_subword = lambda t: re.match(r'^##|▁|^[a-z]{1,2}$', t)  # subword prefixes or short fragments

# Categorize
for token in non_word_tokens:
    if is_digit(token):
        categories['digits'].append(token)
    elif is_punct(token):
        categories['punctuation'].append(token)
    elif is_code(token):
        categories['code_like'].append(token)
    elif is_symbol(token):
        categories['symbols'].append(token)
    elif looks_foreign(token):
        categories['foreign_like'].append(token)
    elif looks_subword(token):
        categories['subwords'].append(token)
    else:
        categories['unknown'].append(token)

# Print summary
for cat, items in categories.items():
    print(f"{cat:15s}: {len(items):5d} tokens ({len(items)/len(non_word_tokens)*100:.2f}%)")


punctuation    :  2180 tokens (2.52%)
digits         :    10 tokens (0.01%)
code_like      :  1593 tokens (1.84%)
subwords       :  2221 tokens (2.57%)
foreign_like   : 19943 tokens (23.04%)
symbols        :     0 tokens (0.00%)
unknown        : 60612 tokens (70.02%)


In [30]:
#Sampling remaining tokens
import random
print(random.sample(categories['unknown'], 500))

['åľłèģį', 'audited', 'rizz', 'îµ', 'æ¸ħåĩģ', 'arabic', 'validates', 'elsius', 'serviceimpl', 'ãģĳãģĳ', 'ï¼įæķ¾åħ¥', 'å°ģåłµ', 'atar', 'mis', 'looks', 'æľīæľ«', 'æ°¸', 'èĩ¼', '(html', 'æ·»åĭłåīĥ', 'è¡¨å§ĳ', 'ograph', 'leneck', 'payload', 'ï¼įè·¯ä¸ĭ', 'imo', 'mammals', 'alyzed', 'wednesdays', 'èµħæł¼èģĥè¯ķ', 'æīĳä»½', 'hhs', 'å°ĩæĺ¯', 'alid', 'åįķåħĥæµĭè¯ķ', '.stdout', 'ï¼įä½ĩåį´', 'eness', 'harmon', 'omnia', '\\leq', 'cular', 'suk', '.bytes', 'unve', 'employees', 'idation', 'queries', 'ãģģèįī', 'lian', 'orect', 'whatsapp', 'åįĭæīª', 'vonne', 'ffff', 'amelior', 'èĩ³å°ĳä¸ģä¸ª', 'mca', 'åĵīä½ľå¤§åń¦', 'ãģĥä»ĸè¯´', 'å£«æ°ķ', 'knobs', 'å¤©æ²³', 'kel', 'ä¿ŀå®ī', 'åń¦æľ¯äº¤æµģ', 'iran', 'cin', 'leb', 'æĭ½æł·', 'å¼ºåº¦åĵį', 'hering', 'aired', 'å°±å°ĩ', 'å¯¼èĩ´äºĩ', 'ssh', 'java', 'à¶', 'åľ¨æīĳèº«ä¸ĭ', 'å°±ä¸įèĥ½', 'æ»ĭåĳ³', '(src', 'ically', 'ï¼įåľ¨æīģè¿°', 'filepath', 'ï¼įä¸´', ',æīĳä»¬åľ¨', 'ï¼įä¸ģåıį', 'connecticut', 'ãģĥè¿ļåı¥è¯ŀ', 'â¦', 'ï¼įåĳħä¸ª', 'felix', 'xamarin', 'åĩłåįģä¸ĩ', 'bench

In [5]:
import re

# This regex allows only ASCII letters, numbers, and common symbols.
ascii_clean_pattern = re.compile(r'^[a-zA-Z0-9\-\._\+\*\?!@#%&=\[\]{}()/\\|,:;"\'<>\^`~]+$')

ascii_like = [t for t in categories['unknown'] if ascii_clean_pattern.fullmatch(t)]
ascii_like_pct = len(ascii_like) / len(categories['unknown']) * 100

print(f"Clean ASCII-like tokens: {len(ascii_like)}")
print(f"Percent of unknowns that are ASCII-like: {ascii_like_pct:.2f}%")


Clean ASCII-like tokens: 15595
Percent of unknowns that are ASCII-like: 24.11%


In [6]:
# Define what counts as "ASCII-clean": only standard letters, digits, and common symbols
ascii_clean_pattern = re.compile(r'^[a-zA-Z0-9\-\._\+\*\?!@#%&=\[\]{}()/\\|,:;"\'<>\^`~]+$')

ascii_like = []
non_ascii_like = []

for token in non_word_tokens:
    if ascii_clean_pattern.fullmatch(token):
        ascii_like.append(token)
    else:
        non_ascii_like.append(token)

# Report
ascii_like_count = len(ascii_like)
non_ascii_like_count = len(non_ascii_like)
ascii_percent = 100 * ascii_like_count / len(non_word_tokens)

print(f"\n--- ASCII-Like vs Non-ASCII ---")
print(f"ASCII-like non-word tokens   : {ascii_like_count:5d} ({ascii_percent:.2f}%)")
print(f"Non-ASCII or corrupted tokens: {non_ascii_like_count:5d} ({100 - ascii_percent:.2f}%)")



--- ASCII-Like vs Non-ASCII ---
ASCII-like non-word tokens   : 19089 (21.21%)
Non-ASCII or corrupted tokens: 70918 (78.79%)


In [31]:
import pandas as pd
from transformers import AutoTokenizer

# Load the IDF dataset
idf_df = pd.read_csv("wiki_tfidf_terms.csv")  # Adjust path as needed
idf_terms = set(idf_df['token'])

# Load LLaDA tokenizer
tokenizer = AutoTokenizer.from_pretrained("GSAI-ML/LLaDA-8B-Instruct", trust_remote_code=True)
llada_vocab = tokenizer.get_vocab()
llada_tokens = list(llada_vocab.keys())

# Compute overlap
tokens_in_wiki = [t for t in llada_tokens if t in idf_terms]
percent_overlap = 100 * len(tokens_in_wiki) / len(llada_tokens)

# Output result
print(f"LLaDA vocab tokens total        : {len(llada_tokens)}")
print(f"Tokens found in wiki_tfidf     : {len(tokens_in_wiki)}")
print(f"Percentage of LLaDA vocab in wiki IDF set: {percent_overlap:.2f}%")


LLaDA vocab tokens total        : 126349
Tokens found in wiki_tfidf     : 14028
Percentage of LLaDA vocab in wiki IDF set: 11.10%


In [32]:
import pandas as pd
from transformers import AutoTokenizer
import random

# Load the Wikipedia TF-IDF dataset
idf_df = pd.read_csv("wiki_tfidf_terms.csv")
#idf_terms = set(t.lower() for t in idf_df['token'])  # lowercase all TF-IDF terms
idf_terms = set(str(t).lower() for t in idf_df['token'].dropna())


# Load LLaDA tokenizer
tokenizer = AutoTokenizer.from_pretrained("GSAI-ML/LLaDA-8B-Instruct", trust_remote_code=True)
llada_vocab = tokenizer.get_vocab()
llada_tokens = list(llada_vocab.keys())

# Normalize LLaDA tokens: strip leading Ġ and lowercase
llada_tokens_cleaned = [t.lstrip("Ġ").lower() for t in llada_tokens]

# Compute overlap
tokens_in_wiki = [t for t in llada_tokens_cleaned if t in idf_terms]
percent_overlap = 100 * len(tokens_in_wiki) / len(llada_tokens_cleaned)

# Output
print(f"LLaDA vocab tokens total        : {len(llada_tokens_cleaned)}")
print(f"Tokens found in wiki_tfidf     : {len(tokens_in_wiki)}")
print(f"Percentage of LLaDA vocab in wiki IDF set: {percent_overlap:.2f}%")

# Sample tokens not in Wikipedia
tokens_not_in_wiki = [t for t in llada_tokens_cleaned if t not in idf_terms]
print("\nSample of LLaDA vocab tokens NOT found in Wikipedia TF-IDF set:")
print(random.sample(tokens_not_in_wiki, 50))


LLaDA vocab tokens total        : 126349
Tokens found in wiki_tfidf     : 60371
Percentage of LLaDA vocab in wiki IDF set: 47.78%

Sample of LLaDA vocab tokens NOT found in Wikipedia TF-IDF set:
['ãģĥè¿ļéĩįçļħ', 'çĥģ', 'éŀłçŀģ', 'è¾ĥå¼º', '(elem', 'ï¼įä¸įè¦ģ', 'è¾ĵçķµ', 'æķ°æį®åńĺåĥ¨', '{u', 'ambiã©n', 'å®ŀåľ°èģĥå¯ł', 'ivable', 'bulld', 'irrit', 'æĭĩéļ¤', '.junit', 'åĩħå¤ĸ', 'ãģĥå®ĥæĺ¯', '|=', 'æķ£åıĳåĩº', 'ãģģä¿ħç½ĺæĸ¯', 'ä¸ńå¸¦', 'invalidoperationexception', '_process', 'ä¼ļéģīæĭ©', 'å¹¿å¤§', 'å¿«éģłå¢ŀéķ¿', 'çķµæºĳ', 'æ¯ķè¾ĥé«ĺçļħ', '++', 'ä¿®é¥°', 'ç¡¬çľĺ', 'âģļl', 'çļħç²īä¸ŀ', 'ï¼įåľłä¸ºæīĳä»¬', 'èĳ¡èĳħç³ĸ', 'æłĳä¸ģ', 'å®¶è£ħ', '.csdn', 'çļħäºĭçī©', 'åı£æ°´', 'èī¹èī±', 'çļħé£i̇éļ©', 'è¯įæŀ¡', 'ä¹łåıªæľī', 'ç»§ç»ńè¯´éģĵ', '_msk', 'èĥ½ä½¿', 'orrow', 'ç»į']


In [33]:
import random
# Sample 50
sample_outliers = random.sample(tokens_not_in_wiki, 500)

print("\nSample of LLaDA vocab tokens NOT found in Wikipedia TF-IDF set:")
for token in sample_outliers:
    print(token)



Sample of LLaDA vocab tokens NOT found in Wikipedia TF-IDF set:
("*
ãģģéĥĵ
ï¼įä¼ģåľ¾
,åģ¼å¾ĺ
æĺł
çľĭè§ģ
ä¸ńæıĳåıĸ
è¾ĵäºĩ
×ľ
æĭķ
ãģģåĳħç§į
è¯į
,çńī
çī©ä»·
çªģå¦ĥåħ¶æŀ¥çļħ
(;
icions
åľ¨åħ¶ä»ĸ
aluronic
è£ģå®ļ
ç»¼åĳī
æŀħçńĳ
("<
æķħäºĭæĥħèĭĥ
olysis
electroph
å¯¹çħ§
ç¨ļå«©
åı¤èģģçļħ
æīĳå°±
éģıå½»
.cod
ä¸»äººåħ¬
æĺ©å°±
çī©ä¸ļ
å¤§åį«
ä¸įåī°
åi̇ĭåĭľåĵį
æĸ°åħ´
èī¯å¤ļ
à°¿
ï¼įèħ±
ãģģçļ®èĥ¤
åħ±äº§ä¸»ä¹ī
é»ħ
_screen
è¢«æĭķèµħ
åįķåħĥæµĭè¯ķ
è´¯å½»
å¼łæľľ
è®°äºĭ
(ed
"""
ä¸¾ä¸ª
_impl
æľ´æĸ°æĺ¶éĺ´
é«ĺåħ´çļħ
çļħåĵį
èººåľ¨åľ°ä¸ĭ
çīĩåıĳçļħ
,æ¿ģåıĳ
ä¸»äººçļħ
ï¼įéº»çĥ¦
ä¸i̇ç¬¬äºį
å°ıç»ħ
ä¸ńåĳ«
ä½ļ
æŀľæłĳ
åīļè¦ģ
ï¼įåī¶
($"
ä½ļ
reated
ä¸ģåīļ
çńīåľ°çļħ
ä½įç½®ä¿¡æģ¯
æ²¾æłĵ
æ²ļæ¼ł
unexpectedeof
ãģģæĭĺ
æĺł
lemented
åĩºä¸ģåī¯
\]).
è¯´å®į
ãģģæīĭæľº
.ext
æĺ¯ä¼ģä¸ļ
ïģî¹
äºįäºº
åĩºçīīæĺ¶éĺ´
è®¾æ³ķ
_load
æi̇¥çķµè¯ŀ
åį´åıĳçi̇°
ä½ıæī·
åi̇¿äººæ°ĳæķ¿åºľ
ä½łå°±æĺ¯
æŀ¸
<u
ãģĥç»¼åĳī
å®ĺåºľ
äºķæ°´
ä¸ģè§ī
éľ·
æ¸ļ
é£ŀé¸ł
éķļè¿ĩ
_stack
æĥ³è®©
jpanel
ä¸ģè¯ń

å¤§åĸŀ
åıĳå±ķ
äºĩè¿ļäºľ
ç§ĭæ°´
æĺ¯åi̇»
{s
ï¼įåħīæĺ¯
æľµ
_no
'][$
ãģģè¿ŀ
å¼