# Convert the new .model and .json files to a Hugging Face tokenizer

In [2]:
import json
from transformers import AutoTokenizer
from train_tokenizer import read_training_info
from efficient_tokenization import tokenize_simple


  from .autonotebook import tqdm as notebook_tqdm


In [None]:

def convert_tokenizer_to_huggingface(new_tokenizer_info_path: str, original_tokenizer: str):
    """Return the base tokenizer with the newly trained tokens added to it.

    Args:
        new_tokenizer_info_path: Path handed to ``read_training_info``; the
            loaded info must contain ``"merges"`` and ``"new_tokens"``.
        original_tokenizer: Name or path accepted by
            ``AutoTokenizer.from_pretrained``.

    Returns:
        The loaded tokenizer after ``add_tokens`` has been called with the
        new tokens.
    """
    # The base tokenizer is loaded first, then the training info.
    base = AutoTokenizer.from_pretrained(original_tokenizer)
    info = read_training_info(new_tokenizer_info_path)

    # The merges are looked up but never applied: only the new tokens are
    # registered on the tokenizer.
    _merges, extra_tokens = info["merges"], info["new_tokens"]

    base.add_tokens(extra_tokens)
    return base

# Tokenizer-info file passed to the converter functions as their first argument.
info_path = "/cmlscratch/astein0/efficient_tokenization_for_inference/tokenizers/Llama-3.2-tokenizer-genqa-math-empty-start/new_mergeable_ranks_128266.json"
# Identifier of the base tokenizer to extend (loaded via AutoTokenizer.from_pretrained).
original_tokenizer = "meta-llama/Llama-3.2-1B"

# Call to the add_tokens-based converter, left disabled.
# tok = convert_tokenizer_to_huggingface(info_path, original_tokenizer)

In [None]:
def convert_tokenizer_to_huggingface_correct(new_tokenizer_info_path: str, original_tokenizer: str):
    """Build an extended ``tokenizer.json`` payload from a base tokenizer.

    Loads the tokenizer named by ``original_tokenizer``, appends the new
    tokens and merges read from ``new_tokenizer_info_path`` to its model
    section, and returns the resulting configuration as a dict. Nothing is
    written to disk here.

    Args:
        new_tokenizer_info_path: Path handed to ``read_training_info``; the
            loaded info must contain ``"merges"`` and ``"new_tokens"``.
        original_tokenizer: Name or path accepted by
            ``AutoTokenizer.from_pretrained``.

    Returns:
        The base tokenizer's JSON configuration with ``model.vocab`` and
        ``model.merges`` replaced by the extended versions.
    """
    # must get original tokenizer from huggingface
    tokenizer = AutoTokenizer.from_pretrained(original_tokenizer)
    tokenizer_json = json.loads(tokenizer._tokenizer.to_str())
    old_merges = tokenizer_json["model"]["merges"]

    # token -> id mapping reported by the tokenizer
    old_vocab = tokenizer.get_vocab()

    training_info = read_training_info(new_tokenizer_info_path)
    new_merges = training_info["merges"]
    new_tokens = training_info["new_tokens"]

    # New tokens receive consecutive ids right after the highest existing id.
    new_vocab = dict(old_vocab)
    starting_index = max(old_vocab.values()) + 1
    for offset, token in enumerate(new_tokens):
        new_vocab[token] = starting_index + offset

    # Ids reported by get_added_vocab() are left out of model.vocab
    # (presumably because they live in the config's added-tokens section;
    # confirm). A set keeps the per-entry membership test O(1) instead of
    # scanning a list once for every vocab entry.
    added_token_ids = set(tokenizer.get_added_vocab().values())
    model_vocab = {
        token: token_id
        for token, token_id in sorted(new_vocab.items(), key=lambda item: item[1])
        if token_id not in added_token_ids
    }

    # Shallow copy: the nested "model" dict is shared with tokenizer_json,
    # which is local to this call.
    new_config = {**tokenizer_json}
    new_config["model"]["vocab"] = model_vocab
    # Original merges first, new merges appended after them.
    new_config["model"]["merges"] = [*old_merges, *new_merges]

    return new_config

# Same tokenizer-info file as in the earlier cell.
info_path = "/cmlscratch/astein0/efficient_tokenization_for_inference/tokenizers/Llama-3.2-tokenizer-genqa-math-empty-start/new_mergeable_ranks_128266.json"
# Kept under a separate name; a later cell rebinds `original_tokenizer` to a tokenizer object.
original_tokenizer_str = "meta-llama/Llama-3.2-1B"

# Extended tokenizer configuration as a dict; a later cell writes it to tokenizer.json.
new_tokenizer_json = convert_tokenizer_to_huggingface_correct(info_path, original_tokenizer_str)



In [None]:
import os

# Step 1: choose the output directory (relative to the working directory) and create it.
new_tokenizer_path = "tokenizers/new_custom_tokenizer"
os.makedirs(new_tokenizer_path, exist_ok=True)

# `original_tokenizer` is rebound here from the model-name string to the loaded tokenizer object.
original_tokenizer = AutoTokenizer.from_pretrained(original_tokenizer_str)

# Step 2: save the unmodified base tokenizer into the output directory.
original_tokenizer.save_pretrained(new_tokenizer_path)

# Step 3: overwrite tokenizer.json in that directory with the extended
# configuration. This must run after save_pretrained, otherwise it would be
# overwritten by the base file.
tokenizer_json_path = os.path.join(new_tokenizer_path, "tokenizer.json")
with open(tokenizer_json_path, "w") as f:
    json.dump(new_tokenizer_json, f, indent=2)



In [None]:
# Reload the tokenizer from the output directory to verify that the edited files load.
new_tokenizer = AutoTokenizer.from_pretrained(new_tokenizer_path)

# Print the vocab size so it can be compared against the expected number of added tokens.
print(f"✅ New tokenizer loaded! Vocab size: {len(new_tokenizer.get_vocab())}")


In [None]:
# Scratch cell: inspect the base tokenizer's serialized vocab.
# `original_tokenizer` is rebound to the model-name string again.
original_tokenizer = "meta-llama/Llama-3.2-1B"
tokenizer = AutoTokenizer.from_pretrained(original_tokenizer)
tokenizer_json = json.loads(tokenizer._tokenizer.to_str())

# Vocab as stored in the model section of the serialized tokenizer.
old_vocab = tokenizer_json["model"]["vocab"]
# Uses len(tokenizer) here, whereas convert_tokenizer_to_huggingface_correct
# uses max(vocab id) + 1.
starting_index = len(tokenizer)


In [None]:
# NOTE(review): `load_pretokenizer` is not imported or defined in any cell
# visible in this notebook export; confirm where it comes from before running.
pre_tok = load_pretokenizer("empty")



In [None]:
def pretokenizer_to_config(pre_tok):
    """Turn a ``Sequence`` holding one ``ByteLevel`` pretokenizer into a config dict.

    Only handles a string form shaped like
    ``Sequence(pretokenizers=[ByteLevel(key=value, ...)])``; every parameter
    value is interpreted as a boolean.

    Args:
        pre_tok: Object whose ``str()`` has the shape described above.

    Returns:
        ``{"type": "Sequence", "pretokenizers": [{"type": "ByteLevel", ...}]}``
        with one boolean entry per ByteLevel parameter.
    """
    sequence_prefix = "Sequence(pretokenizers=["
    byte_level_prefix = "ByteLevel("

    text = str(pre_tok)
    # Drop the Sequence wrapper: the prefix in front and the trailing "])".
    byte_level_text = text[len(sequence_prefix):-2]
    # Drop "ByteLevel(" in front and the closing ")" to leave "k=v, k=v, ...".
    arguments = byte_level_text[len(byte_level_prefix):-1]

    # Each "key=value" piece becomes key -> (value == "true", case-insensitive).
    flags = {
        name: raw.lower() == "true"
        for name, raw in (piece.split("=") for piece in arguments.split(", "))
    }

    byte_level = {"type": "ByteLevel", **flags}
    return {"type": "Sequence", "pretokenizers": [byte_level]}

# Example: convert the pretokenizer held in `pre_tok`; the bare `config`
# expression on the last line makes the notebook display the resulting dict.
config = pretokenizer_to_config(pre_tok)
config

In [None]:
{"pre_tokenizer": {
    "type": "Sequence",
    "pretokenizers": [
      {
        "type": "Split",
        "pattern": {
          "Regex": "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
        },
        "behavior": "Isolated",
        "invert": false
      },
      {
        "type": "ByteLevel",
        "add_prefix_space": false,
        "trim_offsets": true,
        "use_regex": false
      }
    ]
}
}

In [None]:
def pretokenizer_to_config(pre_tok):
    """Convert a pretokenizer to a config dictionary by parsing ``str(pre_tok)``.

    The string form is expected to look like ``Type(key=value, ...)``. A
    ``Sequence(pretokenizers=[...])`` is parsed recursively into a
    ``"pretokenizers"`` list.

    Args:
        pre_tok: Any object whose ``str()`` follows the format above.

    Returns:
        A dict with a ``"type"`` entry plus one entry per parsed parameter.
        ``true``/``false`` (any case) become booleans, integers and simple
        decimals (optionally negative) become numbers, and every other value
        is kept as a string with surrounding quotes removed.

    Raises:
        ValueError: If the text has no ``Type(...)`` shape or a parameter
            lacks an ``=``.
    """

    def parse_value(raw):
        """Coerce one parameter value to bool, int, float, or an unquoted string."""
        lowered = raw.lower()
        if lowered in ("true", "false"):
            return lowered == "true"

        unsigned = raw[1:] if raw.startswith("-") else raw
        try:
            if unsigned.isdigit():
                return int(raw)
            # Exactly one dot, so "1.2.3" stays a string instead of crashing float().
            if unsigned.count(".") == 1 and unsigned.replace(".", "").isdigit():
                return float(raw)
        except ValueError:
            # isdigit() accepts some characters int()/float() reject; keep as text.
            pass
        return raw.strip("'\"")

    def parse_params(param_str):
        """Parse ``key=value, key=value`` text into a dictionary."""
        params = {}
        if not param_str:
            return params

        for param in param_str.split(", "):
            # Split on the first "=" only: values such as a regex look-ahead
            # may contain "=" themselves.
            key, separator, value = param.partition("=")
            if not separator:
                raise ValueError(f"Cannot parse pretokenizer parameter: {param!r}")
            params[key] = parse_value(value)
        return params

    def parse_pretokenizer(tok_str):
        """Recursively parse one ``Type(...)`` string into a config dictionary."""
        open_idx = tok_str.find("(")
        close_idx = tok_str.rfind(")")
        if open_idx == -1 or close_idx < open_idx:
            raise ValueError(f"Cannot parse pretokenizer representation: {tok_str!r}")

        tok_type = tok_str[:open_idx]
        param_str = tok_str[open_idx + 1:close_idx]

        if tok_type != "Sequence":
            return {"type": tok_type, **parse_params(param_str)}

        # Strip "pretokenizers=[" in front and the closing "]".
        pretok_list = param_str[len("pretokenizers=["):-1]

        # Cut the list after each ")" that closes a top-level entry; commas
        # between entries are discarded.
        nested_toks = []
        current = ""
        paren_count = 0
        for char in pretok_list:
            if char == "(":
                paren_count += 1
            elif char == ")":
                paren_count -= 1
            current += char
            if paren_count == 0 and char == ")":
                nested_toks.append(current)
                current = ""
            elif paren_count == 0 and char == ",":
                current = ""

        return {
            "type": tok_type,
            "pretokenizers": [
                parse_pretokenizer(tok.strip()) for tok in nested_toks if tok.strip()
            ],
        }

    return parse_pretokenizer(str(pre_tok))

# Example usage:
from tokenizers import pre_tokenizers

# Sample pretokenizers used to exercise pretokenizer_to_config.
# NOTE(review): `load_pretokenizer` is not imported or defined in any cell
# visible in this notebook export; confirm its origin before running.
pretok_configs = [
    pre_tokenizers.ByteLevel(add_prefix_space=False, trim_offsets=True),
    pre_tokenizers.Whitespace(),
    pre_tokenizers.Sequence([
        pre_tokenizers.ByteLevel(add_prefix_space=False),
        pre_tokenizers.Whitespace()
    ]),
    pre_tokenizers.Metaspace(replacement="▁"),
    pre_tokenizers.Digits(individual_digits=True),
    load_pretokenizer("empty"),
    load_pretokenizer("llama3"),
]

# Print each pretokenizer next to its parsed config for visual inspection.
# The loop rebinds the notebook-level names `pre_tok` and `config`.
for pre_tok in pretok_configs:
    config = pretokenizer_to_config(pre_tok)
    print(f"\nPretokenizer: {pre_tok}")
    print(f"Config: {config}")

 # DUPLICATING the dataset

In [7]:
from transformers import AutoTokenizer
import psutil
import datasets

batch_size = 500

# Imported here so the fallback below does not depend on an earlier cell
# having imported `os`.
import os

try:
    # Physical core count, capped by the number of CPUs this process is allowed to run on.
    threads = min(psutil.cpu_count(logical=False), len(psutil.Process().cpu_affinity()))
except Exception:
    # Best-effort fallback, e.g. when cpu_count() returns None or
    # cpu_affinity() is unavailable on this platform. `except Exception`
    # (rather than a bare `except`) lets KeyboardInterrupt/SystemExit propagate.
    threads = os.cpu_count()

import logging
log = logging.getLogger(__name__)

logging.basicConfig(
    level=logging.INFO,  # Set the minimum log level
    format="%(asctime)s - %(levelname)s - %(message)s",  # Include time, level, and message
    datefmt="%Y-%m-%d %H:%M:%S"  # Specify the date and time format
)

# Load the model and tokenizer
log.info("Loading model and tokenizer...")
model_name = "meta-llama/Llama-3.2-1B"
# new_tokenizer_path = "/cmlscratch/astein0/efficient_tokenization_for_inference/tokenizers/Llama-3.2-tokenizer-genqa-math-empty-start-1000"
new_tokenizer_path = "/cmlscratch/astein0/efficient_tokenization_for_inference/tokenizers/Llama-3.2-tokenizer-genqa-math-empty-start-10"

log.info("Loading base tokenizer...")
base_tokenizer = AutoTokenizer.from_pretrained(model_name)
log.info("Loading extended tokenizer...")
extended_tokenizer = AutoTokenizer.from_pretrained(new_tokenizer_path)

log.info("Loading dataset...")
ds_path = "/fs/cml-projects/llm-pretraining/datasets/raw/genqa/math"
ds = datasets.load_from_disk(ds_path)

log.info("Getting genqa data...")
ds = tokenize_simple.get_genqa_data(ds, track_role=True, batch_size=batch_size, threads=threads)
# log.info("Creating translation dataset...")
# ds = create_translation_dataset(ds, base_tokenizer, extended_tokenizer, batch_size, threads)

2025-03-20 01:57:58 - INFO - Loading model and tokenizer...
2025-03-20 01:57:58 - INFO - Loading base tokenizer...
2025-03-20 01:57:59 - INFO - Loading extended tokenizer...
2025-03-20 01:58:00 - INFO - Loading dataset...
2025-03-20 01:58:00 - INFO - Getting genqa data...


In [4]:
import importlib
# Pick up edits to tokenize_simple without restarting the kernel.
importlib.reload(tokenize_simple)
print(ds)

log.info("Creating translation dataset...")
# Earlier variant, which took the loaded extended tokenizer object:
# ds = tokenize_simple.create_translation_dataset(ds, base_tokenizer, extended_tokenizer, 2, threads)
# The template variant receives the tokenizer *path* instead of the tokenizer
# object. The positional 2 sits where the disabled call in the dataset-loading
# cell passed batch_size; presumably it is a batch size (confirm).
ds = tokenize_simple.create_translation_dataset_with_template(ds, base_tokenizer, new_tokenizer_path, 2, threads)

2025-03-05 21:39:51 - INFO - Creating translation dataset...


Dataset({
    features: ['text'],
    num_rows: 515509
})
The OrderedVocab you are attempting to save contains holes for indices [128000, 128001, 128002, 128003, 128004, 128005, 128006, 128007, 128008, 128009, 128010, 128011, 128012, 128013, 128014, 128015, 128016, 128017, 128018, 128019, 128020, 128021, 128022, 128023, 128024, 128025, 128026, 128027, 128028, 128029, 128030, 128031, 128032, 128033, 128034, 128035, 128036, 128037, 128038, 128039, 128040, 128041, 128042, 128043, 128044, 128045, 128046, 128047, 128048, 128049, 128050, 128051, 128052, 128053, 128054, 128055, 128056, 128057, 128058, 128059, 128060, 128061, 128062, 128063, 128064, 128065, 128066, 128067, 128068, 128069, 128070, 128071, 128072, 128073, 128074, 128075, 128076, 128077, 128078, 128079, 128080, 128081, 128082, 128083, 128084, 128085, 128086, 128087, 128088, 128089, 128090, 128091, 128092, 128093, 128094, 128095, 128096, 128097, 128098, 128099, 128100, 128101, 128102, 128103, 128104, 128105, 128106, 128107, 128108

Traceback (most recent call last):
  File "/nfshomes/astein0/.pyenv/versions/3.10.4/envs/eff-tok/lib/python3.10/site-packages/multiprocess/process.py", line 314, in _bootstrap
    self.run()
  File "/nfshomes/astein0/.pyenv/versions/3.10.4/envs/eff-tok/lib/python3.10/site-packages/multiprocess/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/nfshomes/astein0/.pyenv/versions/3.10.4/envs/eff-tok/lib/python3.10/site-packages/multiprocess/managers.py", line 599, in _run_server
    server.serve_forever()
  File "/nfshomes/astein0/.pyenv/versions/3.10.4/envs/eff-tok/lib/python3.10/site-packages/multiprocess/managers.py", line 184, in serve_forever
    sys.exit(0)
SystemExit: 0

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/nfshomes/astein0/.pyenv/versions/3.10.4/envs/eff-tok/lib/python3.10/site-packages/multiprocess/util.py", line 300, in _run_finalizers
    finalizer()
  File "/nfshomes/as

In [None]:
import datasets
# Load the saved translation dataset from disk; rebinds `ds_path` and `ds`
# used by the raw-dataset cell above.
ds_path = "/cmlscratch/astein0/efficient_tokenization_for_inference/datasets/translation_tokenized"
ds = datasets.load_from_disk(ds_path)

# Converting the tokenizer to one with fewer added tokens

In [2]:
from transformers import AutoTokenizer

# Directory of the extended tokenizer that later cells read from and shrink.
tokenizer_path = "/cmlscratch/astein0/efficient_tokenization_for_inference/tokenizers/Llama-3.2-tokenizer-genqa-math-empty-start-1000"

tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
print(f"Loaded HF tokenizer from vocab file: {tokenizer_path}")
# Total number of entries in the loaded vocab.
print(len(tokenizer.get_vocab()))

Loaded HF tokenizer from vocab file: /cmlscratch/astein0/efficient_tokenization_for_inference/tokenizers/Llama-3.2-tokenizer-genqa-math-empty-start-1000
129256


In [5]:
import os
import json
import efficient_tokenization.tokenization_utils

import importlib
# Pick up edits to tokenization_utils without restarting the kernel.
importlib.reload(efficient_tokenization.tokenization_utils)

# Requires `tokenizer_path` from the tokenizer-loading cell; running this cell
# first fails with the NameError recorded in the output below.
# The encoding is given explicitly so decoding does not depend on the locale default.
# Despite its name, `tokenizer_json` holds the training info from here on,
# not a tokenizer.json payload.
with open(os.path.join(tokenizer_path, "training_info.json"), "r", encoding="utf-8") as f:
    tokenizer_json = json.load(f)

# Quick overview of what the training info contains.
print(tokenizer_json.keys())
print(len(tokenizer_json["merges"]))
print(len(tokenizer_json["sizes"]))
print(len(tokenizer_json["new_tokens"]))
print(tokenizer_json["static_info"])
print(tokenizer_json["state"])


NameError: name 'tokenizer_path' is not defined

In [4]:
import importlib
# Requires `efficient_tokenization.tokenization_utils` to have been imported in
# an earlier cell; otherwise this fails with the NameError recorded below.
importlib.reload(efficient_tokenization.tokenization_utils)
# Scratch output location for the shrunken tokenizer.
new_save_path = "/cmlscratch/astein0/efficient_tokenization_for_inference/tokenizers/testing"
# Save helper built from the existing tokenizer directory and the new output path.
sm = efficient_tokenization.tokenization_utils.SaveModule.from_path(old_path=tokenizer_path, new_path=new_save_path)

NameError: name 'efficient_tokenization' is not defined

In [9]:
# Merges and sizes come from the training info loaded into `tokenizer_json`.
merges = tokenizer_json["merges"]
sizes = tokenizer_json["sizes"]
# additional_info = tokenizer_json["additional_info"]
num_added_tokens = 100

# NOTE(review): this call passes `num_added_tokens=`, while the batch loop in
# the final cell passes `num_new_tokens=` to the same method. Confirm which
# keyword the current SaveModule.shrink_tokenizer accepts.
sm.shrink_tokenizer(merges, sizes, num_added_tokens=num_added_tokens)

Saving tokenizer to /cmlscratch/astein0/efficient_tokenization_for_inference/tokenizers/testing


In [None]:
import efficient_tokenization

# Compare the vocabularies of two Llama tokenizers.
tokenizer_path1 = "meta-llama/Llama-3.1-8B-Instruct"
tokenizer_path2 = "meta-llama/Llama-3.2-3B-Instruct"
tok1 = AutoTokenizer.from_pretrained(tokenizer_path1)
tok2 = AutoTokenizer.from_pretrained(tokenizer_path2)

# token -> id mappings for both tokenizers.
vocab1 = tok1.get_vocab()
vocab2 = tok2.get_vocab()

# The recorded result for this pair is True.
efficient_tokenization.tokenization_utils.compare_dicts(vocab1, vocab2)






True

# Now run the truncation for a bunch of sizes

In [None]:
import importlib
# Pick up edits to tokenization_utils without restarting the kernel.
importlib.reload(efficient_tokenization.tokenization_utils)
# `sm` is reused from the earlier SaveModule cell; re-creating it is left disabled.
# sm = efficient_tokenization.tokenization_utils.SaveModule.from_path(old_path=tokenizer_path)
merges = tokenizer_json["merges"]
sizes = tokenizer_json["sizes"]
# additional_info = tokenizer_json["additional_info"]
# Numbers of new tokens to keep; one tokenizer is saved per entry.
num_new_tokens_list = [1, 5, 20, 50, 100, 200, 300, 400, 500, 600, 700, 800, 900]
for num_new_tokens in num_new_tokens_list:
    # Output path derived from the token count and the source tokenizer path.
    path = efficient_tokenization.tokenization_utils.get_new_path(num_new_tokens, tokenizer_path)
    sm.shrink_tokenizer(merges, sizes, num_new_tokens=num_new_tokens, new_path=path)



Setting save_loc to /cmlscratch/astein0/efficient_tokenization_for_inference/tokenizers/Llama-3.2-tokenizer-genqa-math-empty-start-1
Saving tokenizer to /cmlscratch/astein0/efficient_tokenization_for_inference/tokenizers/Llama-3.2-tokenizer-genqa-math-empty-start-1
Setting save_loc to /cmlscratch/astein0/efficient_tokenization_for_inference/tokenizers/Llama-3.2-tokenizer-genqa-math-empty-start-5
Saving tokenizer to /cmlscratch/astein0/efficient_tokenization_for_inference/tokenizers/Llama-3.2-tokenizer-genqa-math-empty-start-5
Setting save_loc to /cmlscratch/astein0/efficient_tokenization_for_inference/tokenizers/Llama-3.2-tokenizer-genqa-math-empty-start-20
Saving tokenizer to /cmlscratch/astein0/efficient_tokenization_for_inference/tokenizers/Llama-3.2-tokenizer-genqa-math-empty-start-20
Setting save_loc to /cmlscratch/astein0/efficient_tokenization_for_inference/tokenizers/Llama-3.2-tokenizer-genqa-math-empty-start-50
Saving tokenizer to /cmlscratch/astein0/efficient_tokenization_for