In [28]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import psutil
plt.style.use('ggplot')

import torch
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer, TextStreamer, AutoTokenizer, AutoModelForSequenceClassification

import datasets
# from unsloth.chat_templates import get_chat_template
# from unsloth import FastLanguageModel
# from datasets import Dataset
# from unsloth import is_bfloat16_supported


# Warnings
import warnings
warnings.filterwarnings("ignore")



%matplotlib inline


In [None]:
from efficient_tokenization.tokenize_simple import get_genqa_data, get_tokenized_data, flatten_genqa_conversations, my_tokenize

import logging
# Root logger: INFO and above, rendered as "<timestamp> - <level> - <message>".
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    level=logging.INFO,
)

# Logger used by the cells below.
log = logging.getLogger(__name__)


batch_size = 500

# Worker count for the batched dataset.map / dataset.filter calls: the smaller of
# the physical core count and the number of CPUs this process is allowed to run on.
# The psutil lookup can fail (cpu_count(logical=False) may be None, which makes
# min() raise; cpu_affinity() is not available on every platform), so fall back to
# os.cpu_count().  Catch Exception rather than using a bare `except:` so that
# KeyboardInterrupt / SystemExit still propagate.
try:
    threads = min(psutil.cpu_count(logical=False), len(psutil.Process().cpu_affinity()))
except Exception:
    # os.cpu_count() may itself return None; always end up with a usable integer.
    threads = os.cpu_count() or 1


# --- Run configuration: base model, raw dataset location, extended-vocab file ---
log.info("Loading model and tokenizer...")
model_name = "meta-llama/Llama-3.2-1B"

raw_data_name, ext = "genqa", "math"
ds_path = f"/fs/cml-projects/llm-pretraining/datasets/raw/{raw_data_name}/{ext}"

# Location of the previously trained merge table ("old" tokenizer artefacts).
pre_tok_name = "empty"
tokenizer_file_old = "new_mergeable_ranks_2000.model"
tokenizer_path_old = f"/cmlscratch/astein0/LLM-pretraining/LLM-pretraining-tokenization/tokenizers/Llama-3.2-tokenizer-genqa-{ext}-{pre_tok_name}-start"
vocab_file_path = "/".join((tokenizer_path_old, tokenizer_file_old))

# DATASET
# dataset_path = "/fs/cml-projects/llm-pretraining/datasets/processed/ultrachat/train"
# The raw dataset directory is the same path as ds_path above.
dataset_path = ds_path

log.info("Downloading and processing raw dataset")
# tokenizer, data, tokenized_dataset = get_tokenized_data(vocab_file_path, ds_path, pre_tok_name=pre_tok_name)

# Original (unextended) tokenizer of the base model.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# data = get_genqa_data(dataset_path)
# tokenized_dataset = my_tokenize(data.select_columns("text"), tokenizer)
# tokenized_dataset = tokenized_dataset.map(lambda batch: {"num_tokens": [len(ids) for ids in batch["input_ids"]]}, batched=True, batch_size=batch_size, num_proc=threads)




In [30]:
import importlib

import efficient_tokenization.tokenize_simple as tokenize_simple

# Reload BEFORE binding names with `from ... import`: importlib.reload() does not
# rebind names that were imported from the module earlier, so importing first
# would leave these three functions pointing at the pre-reload code.
importlib.reload(tokenize_simple)
from efficient_tokenization.tokenize_simple import get_tokenized_data, flatten_genqa_conversations, my_tokenize

# Base-model tokenizer (no pad token is set here; see the commented line below).
model_name = "meta-llama/Llama-3.2-1B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# tokenizer.pad_token = tokenizer.eos_token

# # data = tokenize_simple.get_genqa_data(dataset_path, tokenizer=tokenizer, track_role=True)
# data = tokenize_simple.get_genqa_data(dataset_path, track_role=True)
# tokenized_dataset = my_tokenize(data.select_columns("text"), tokenizer)
# tokenized_dataset = tokenized_dataset.map(lambda batch: {"num_tokens": [len(ids) for ids in batch["input_ids"]]}, batched=True, batch_size=batch_size, num_proc=threads)



In [None]:
tokenizer

In [None]:
from transformers import DataCollatorWithPadding
from torch.utils.data import DataLoader

model_name = "meta-llama/Llama-3.2-1B"
batch_size = 4

# The tokenizer is given a pad token (EOS is reused) so the collator can pad.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

dataset_path = "/cmlscratch/astein0/efficient_tokenization_for_inference/datasets/test"
tokenized_dataset = datasets.load_from_disk(dataset_path)

# Split the dataset into train (90%) and validation (10%)
tokenized_dataset = tokenized_dataset.train_test_split(test_size=0.1)
# tokenized_dataset[0]

# Only the train split has these columns dropped; the test split keeps them.
tokenized_dataset["train"] = tokenized_dataset["train"].remove_columns(["text", "num_tokens"])

data_collator = DataCollatorWithPadding(tokenizer, max_length=2048, padding=True)

# Small fixed preview subset (first 16 train rows) for manual inspection.
ds = tokenized_dataset["train"].select(range(16))
print(ds[0])

train_loader = DataLoader(
    tokenized_dataset["train"],
    collate_fn=data_collator,
    shuffle=True,
    # Use the configured value instead of repeating the literal 4, so changing
    # `batch_size` above actually changes the loader.
    batch_size=batch_size,
    pin_memory=True
)

In [None]:
# Materialise the 16 preview rows as plain per-example dicts.
sample_batch = [ds[row] for row in range(16)]

# Convert lists to PyTorch tensors before collation
for example in sample_batch:
    for field in ("input_ids", "attention_mask", "labels"):
        example[field] = torch.tensor(example[field], dtype=torch.long)

collated_batch = data_collator(sample_batch)
print(collated_batch)

# sample_batch = [tokenized_dataset["train"][i] for i in range(4)]  # Pick a few examples
# collated_batch = data_collator(sample_batch)
# print(collated_batch)
# print(next(iter(train_loader)))

In [None]:
# data['Context_length'] = tokenized_dataset.select_columns('num_tokens').apply(len)
# dataset_path = "/cmlscratch/astein0/efficient_tokenization_for_inference/datasets/test"
dataset_path = "/cmlscratch/astein0/efficient_tokenization_for_inference/datasets/new_tokenized"
# args.dataset

# Load from disk, then hold out 10% as a test split (90% train).
ds = datasets.load_from_disk(dataset_path).train_test_split(test_size=0.1)

# Histogram (with KDE overlay) of the per-example token counts in the train split.
fig, ax = plt.subplots(figsize=(10, 3))
sns.histplot(ds["train"]['num_tokens'], bins=50, kde=True, ax=ax)
ax.set_title('Distribution of Context Lengths')
ax.set_xlabel('Length of Context')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
## CHECKING NEW TOKENS
# Setup for the added-token statistics below.  Note that this rebinds the
# notebook-wide `batch_size`, `threads`, `dataset_path` and `ds` names, and that
# `ds` is the dataset exactly as loaded from disk (no train/test split applied here).
import datasets
batch_size = 1000  # rows handed to count_large_tokens per map() call
threads = 16  # num_proc for map(); hard-coded, overrides the value computed earlier
dataset_path = "/cmlscratch/astein0/efficient_tokenization_for_inference/datasets/new_tokenized"
# args.dataset

ds = datasets.load_from_disk(dataset_path)

def count_large_tokens(batch, threshold=128000):
    """Count, per example, how many token ids are strictly greater than ``threshold``.

    Written for ``Dataset.map(..., batched=True)``: ``batch['input_ids']`` is a list
    of token-id sequences and every returned list has one entry per example.

    Args:
        batch: Mapping with an ``'input_ids'`` entry (list of lists of ints).
        threshold: Ids strictly above this value are counted.  Defaults to 128000,
            the cut-off this notebook uses for "large" ids.

    Returns:
        dict with three parallel lists:
            ``large_token_count``    - number of ids > ``threshold``,
            ``total_tokens``         - sequence length,
            ``percent_large_tokens`` - count / length as a fraction in [0, 1]
                                       (0.0 for an empty sequence).
    """
    counts = [sum(1 for token_id in ids if token_id > threshold) for ids in batch['input_ids']]
    totals = [len(ids) for ids in batch['input_ids']]
    # Guard against empty sequences, which would otherwise raise ZeroDivisionError.
    percents = [count / total if total else 0.0 for count, total in zip(counts, totals)]
    return {'large_token_count': counts, 'total_tokens': totals, 'percent_large_tokens': percents}

if "large_token_count" not in ds.column_names:
    # Apply the counting function to the dataset with batching
    dataset_with_counts = ds.map(
        count_large_tokens, 
        batched=True, 
        batch_size=batch_size, 
        num_proc=threads
    )
else:
    # The count column is already present on the loaded dataset: reuse it as is,
    # so `dataset_with_counts` is always bound for the plotting code below.
    dataset_with_counts = ds

# You can then analyze the distribution
plt.figure(figsize=(10, 3))
# sns.histplot(dataset_with_counts['large_token_count'], bins=50, kde=True)
sns.histplot(dataset_with_counts['percent_large_tokens'], bins=50, kde=True)
plt.title('Distribution of Tokens with ID > 128000')
# The plotted column is count / total per example (a fraction), not a raw count,
# so label the axis accordingly.
plt.xlabel('Fraction of Tokens > 128000')
plt.ylabel('Frequency')
plt.show()

# Print some statistics (these use the raw per-example counts)
print("Average tokens > 128000 per example:", np.mean(dataset_with_counts['large_token_count']))
print("Max tokens > 128000 in any example:", np.max(dataset_with_counts['large_token_count']))
print("Total tokens > 128000:", sum(dataset_with_counts['large_token_count']))


In [None]:
# Rough number of optimizer steps per pass over the train split:
# examples / per-device batch / gradient-accumulation steps / processes.
batch_size = 2
gradient_accumulation_steps = 8
processes = 8

# When the cells run top to bottom, `ds` was last rebound to the dataset as loaded
# from disk (unsplit), where ds["train"] would be a column lookup.  A split
# DatasetDict is a dict subclass, so use it directly when present and otherwise
# apply the same 90/10 split used elsewhere in this notebook; the split sizes do
# not depend on the shuffle.
if isinstance(ds, dict):
    train_split = ds["train"]
else:
    train_split = ds.train_test_split(test_size=0.1)["train"]
print(len(train_split) / batch_size / gradient_accumulation_steps / processes)


In [None]:
# filtered_data = tokenized_dataset[tokenized_dataset['num_tokens'] <= 500]
# Keep only examples shorter than 2000 tokens.
# NOTE(review): this reads whatever `tokenized_dataset`, `batch_size` and `threads`
# currently are in the kernel; it presumably expects a flat dataset that still has
# a `num_tokens` column -- confirm which earlier cell is meant to feed it.
filtered_data = tokenized_dataset.filter(
    lambda batch: [length < 2000 for length in batch["num_tokens"]],
    batched=True,
    batch_size=batch_size,
    num_proc=threads,
)

# ln_Context = filtered_data['num_tokens'].apply(len)
fig, ax = plt.subplots(figsize=(10, 3))
sns.histplot(filtered_data['num_tokens'], bins=50, kde=True, ax=ax)
ax.set_title('Distribution of Context Lengths')
ax.set_xlabel('Length of Context')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Load the 4-bit base model through unsloth and wrap it with LoRA adapters.
# NOTE(review): `FastLanguageModel` is only imported in a commented-out line in the
# first cell, so this cell needs that import restored before it can run.
max_seq_length = 2048
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Llama-3.2-1B-bnb-4bit",
    max_seq_length=max_seq_length,
    load_in_4bit=True,
    dtype=None,
)

# LoRA on all attention and MLP projections (rank 16, rank-stabilised scaling).
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    lora_alpha=16,
    lora_dropout=0,
    target_modules=["q_proj", "k_proj", "v_proj", "up_proj", "down_proj", "o_proj", "gate_proj"],
    use_rslora=True,
    use_gradient_checkpointing="unsloth",
    random_state = 32,
    loftq_config = None,
)
# print_trainable_parameters() prints the summary itself and returns None;
# wrapping it in print() only adds a stray "None" line.
model.print_trainable_parameters()


EXTEND VOCAB

In [1]:
import torch
from liger_kernel.transformers import AutoLigerKernelForCausalLM
from efficient_tokenization.tokenize_simple import get_tokenizer, AutoTokenizer

# Base checkpoint for the model weights; the tokenizer is loaded separately from
# the extended-tokenizer directory below, so the two vocabularies need not match yet.
model_name = "meta-llama/Llama-3.2-1B"
# tokenizer_path = "/cmlscratch/astein0/LLM-pretraining/LLM-pretraining-tokenization/tokenizers/Llama-3.2-tokenizer-genqa-math-empty-start/new_mergeable_ranks_2000.model"
tokenizer_path = "/cmlscratch/astein0/efficient_tokenization_for_inference/tokenizers/Llama-3.2-tokenizer-genqa-math-empty-start-1000"
# `AutoTokenizer` here is the name imported from efficient_tokenization.tokenize_simple above.
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

# Base model loaded in bfloat16 with SDPA attention via the Liger-kernel auto class.
model = AutoLigerKernelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    attn_implementation="sdpa",
    use_cache=False,  # Disable KV cache during training
    # device_map="auto"  # Let accelerate handle device mapping
)

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Second handle on the extended tokenizer; this is the same directory string as
# `tokenizer_path` in the model-loading cell above.
new_tokenizer_path = "/cmlscratch/astein0/efficient_tokenization_for_inference/tokenizers/Llama-3.2-tokenizer-genqa-math-empty-start-1000"
print("Loading tokenizer...")
extended_tokenizer = AutoTokenizer.from_pretrained(new_tokenizer_path)
texts = ['Translate the following text into the same text but with fewer tokens: text1: <|user|> Find the area of a trapezoid with bases of 8 cm and 12 cm and an altitude of 6 cm. Both bases are extended by 3 cm to form a new trapezoid. Find the area of the new trapezoid. Express your answer in simplified form.<|assistant|> The area of the original trapezoid is:\n\n```\nArea = (8 cm + 12 cm) * 6 cm / 2 = 60 cm²\n```\n\nThe ratio of the bases of the new trapezoid to the old trapezoid is:\n\n```\nRatio = (8 cm + 3 cm + 12 cm + 3 cm) / (8 cm + 12 cm) = 1\n```\n\nSince the bases are in the same ratio, the areas of the trapezoids will also be in the same ratio, so the area of the new trapezoid is:\n\n```\nNew Area = 60 cm² * 1 = 60 cm²\n```<|user|> If the original trapezoid is partitioned into two congruent right triangles by the altitude, what is the area of each triangle?<|assistant|> The area of each triangle is:\n\n```\nTriangle Area = (8 cm + 12 cm) * 6 cm / 2 / 2 = 30 cm²\n```text2: <|user|> Find the area of a trapezoid with bases of 8 cm and 12 cm and an altitude of 6 cm. Both bases are extended by 3 cm to form a new trapezoid. Find the area of the new trapezoid. 
Express your answer in simplified form.<|assistant|> The area of the original trapezoid is:\n\n```\nArea = (8 cm + 12 cm) * 6 cm / 2 = 60 cm²\n```\n\nThe ratio of the bases of the new trapezoid to the old trapezoid is:\n\n```\nRatio = (8 cm + 3 cm + 12 cm + 3 cm) / (8 cm + 12 cm) = 1\n```\n\nSince the bases are in the same ratio, the areas of the trapezoids will also be in the same ratio, so the area of the new trapezoid is:\n\n```\nNew Area = 60 cm² * 1 = 60 cm²\n```<|user|> If the original trapezoid is partitioned into two congruent right triangles by the altitude, what is the area of each triangle?<|assistant|> The area of each triangle is:\n\n```\nTriangle Area = (8 cm + 12 cm) * 6 cm / 2 / 2 = 30 cm²\n```', "Translate the following text into the same text but with fewer tokens: text1: <|user|> Consider the differential equation:\n\n```y'' + y' - 2y = e^-x```\n\nSolve this equation using the method of undetermined coefficients.<|assistant|> Step 1: Find the Solution to the Homogeneous Equation \n\nThe homogeneous equation is:\n\n```y'' + y' - 2y = 0```\n\nIts characteristic equation is:\n\n```r^2 + r - 2 = 0```\n\nSolving for the roots, we get:\n\n```r = 1 ± √3i```\n\nTherefore, the solution to the homogeneous equation is:\n\n```y_h(x) = e^x (c_1 cos √3 x + c_2 sin √3 x)```\n\n Step 2: Find a Particular Solution to the Non-Homogeneous Equation \n\nSince the non-homogeneous term is e^-x, we guess a particular solution of the form:\n\n```y_p(x) = Ae^-x```\n\nDifferentiating twice, we get:\n\n```y_p'(x) = -Ae^-x```\n\n```y_p''(x) = Ae^-x```\n\nSubstituting these into the non-homogeneous equation, we get:\n\n```Ae^-x - Ae^-x - 2Ae^-x = e^-x```\n\nSolving for A, we get:\n\n```A = 1/2```\n\nTherefore, the particular solution is:\n\n```y_p(x) = (1/2)e^-x```\n\n Step 3: Combine the Homogeneous and Particular Solutions \n\nThe general solution to the non-homogeneous equation is:\n\n```y(x) = y_h(x) + y_p(x)```\n\n```y(x) = e^x (c_1 cos √3 x + c_2 sin √3 x) + 
(1/2)e^-x```<|user|> Find the general solution to the following differential equation:\n\n```y''' - 3y'' + 2y' - y = 0```<|assistant|> Step 1: Find the Solution to the Homogeneous Equation \n\nThe homogeneous equation is:\n\n```y''' - 3y'' + 2y' - y = 0```\n\nIts characteristic equation is:\n\n```r^3 - 3r^2 + 2r - 1 = 0```\n\nFactoring, we get:\n\n```(r - 1)^2 (r - 1) = 0```\n\nTherefore, the roots are:\n\n```r = 1, 1, 1```\n\nTherefore, the solution to the homogeneous equation is:\n\n```y_h(x) = c_1 e^x + c_2 x e^x + c_3 x^2 e^x```\n\n Step 2: Find a Particular Solution to the Non-Homogeneous Equation \n\nSince the non-homogeneous term is 0, the particular solution is:\n\n```y_p(x) = 0```\n\n Step 3: Combine the Homogeneous and Particular Solutions \n\nThe general solution to the non-homogeneous equation is:\n\n```y(x) = y_h(x) + y_p(x)```\n\n```y(x) = c_1 e^x + c_2 x e^x + c_3 x^2 e^x```text2: <|user|> Consider the differential equation:\n\n```y'' + y' - 2y = e^-x```\n\nSolve this equation using the method of undetermined coefficients.<|assistant|> Step 1: Find the Solution to the Homogeneous Equation \n\nThe homogeneous equation is:\n\n```y'' + y' - 2y = 0```\n\nIts characteristic equation is:\n\n```r^2 + r - 2 = 0```\n\nSolving for the roots, we get:\n\n```r = 1 ± √3i```\n\nTherefore, the solution to the homogeneous equation is:\n\n```y_h(x) = e^x (c_1 cos √3 x + c_2 sin √3 x)```\n\n Step 2: Find a Particular Solution to the Non-Homogeneous Equation \n\nSince the non-homogeneous term is e^-x, we guess a particular solution of the form:\n\n```y_p(x) = Ae^-x```\n\nDifferentiating twice, we get:\n\n```y_p'(x) = -Ae^-x```\n\n```y_p''(x) = Ae^-x```\n\nSubstituting these into the non-homogeneous equation, we get:\n\n```Ae^-x - Ae^-x - 2Ae^-x = e^-x```\n\nSolving for A, we get:\n\n```A = 1/2```\n\nTherefore, the particular solution is:\n\n```y_p(x) = (1/2)e^-x```\n\n Step 3: Combine the Homogeneous and Particular Solutions \n\nThe general solution to the 
non-homogeneous equation is:\n\n```y(x) = y_h(x) + y_p(x)```\n\n```y(x) = e^x (c_1 cos √3 x + c_2 sin √3 x) + (1/2)e^-x```<|user|> Find the general solution to the following differential equation:\n\n```y''' - 3y'' + 2y' - y = 0```<|assistant|> Step 1: Find the Solution to the Homogeneous Equation \n\nThe homogeneous equation is:\n\n```y''' - 3y'' + 2y' - y = 0```\n\nIts characteristic equation is:\n\n```r^3 - 3r^2 + 2r - 1 = 0```\n\nFactoring, we get:\n\n```(r - 1)^2 (r - 1) = 0```\n\nTherefore, the roots are:\n\n```r = 1, 1, 1```\n\nTherefore, the solution to the homogeneous equation is:\n\n```y_h(x) = c_1 e^x + c_2 x e^x + c_3 x^2 e^x```\n\n Step 2: Find a Particular Solution to the Non-Homogeneous Equation \n\nSince the non-homogeneous term is 0, the particular solution is:\n\n```y_p(x) = 0```\n\n Step 3: Combine the Homogeneous and Particular Solutions \n\nThe general solution to the non-homogeneous equation is:\n\n```y(x) = y_h(x) + y_p(x)```\n\n```y(x) = c_1 e^x + c_2 x e^x + c_3 x^2 e^x```"]

# Tokenize both prompts without BOS/EOS and push the first one through the model.
sample = extended_tokenizer(texts, add_special_tokens=False)
ids_list = torch.tensor(sample["input_ids"][0])
print(ids_list)
# The model's forward expects input_ids shaped (batch_size, sequence_length);
# `ids_list` is 1-D, so add a leading batch axis of size 1.
# NOTE(review): in top-to-bottom order the embeddings are only extended in a later
# cell, so ids of added tokens would presumably fall outside the embedding table
# here -- confirm the intended cell order.
model(ids_list[None, :])

In [2]:
import efficient_tokenization.extend_embeddings as extend_embeddings

import importlib
importlib.reload(extend_embeddings)
import json

from finetune import my_custom_forward

# tokenizer_json = json.loads(tokenizer._tokenizer.to_str())

# merge_list = tokenizer_json["model"]["merges"]
# print(merge_list[0])
# print(len(merge_list))

# Number of embedding rows to add = extended tokenizer size - model's current vocab size.
# NOTE(review): `original_vocab_size` is read from the live model config; if
# extend_model_embeddings updates that config (not visible here), re-running this
# cell would yield num_new_tokens == 0 and overwrite the "original" size.
embedding_init_strategy = "merge"
new_vocab_size = len(tokenizer)
original_vocab_size = model.config.vocab_size
num_new_tokens = new_vocab_size - original_vocab_size
print(f"num_new_tokens: {num_new_tokens}")
print(f"len(tokenizer): {new_vocab_size}")

# Spot-check two ids.
print(tokenizer._tokenizer.id_to_token(128260))

# A bare expression in the middle of a cell is never displayed, so print it.
print(tokenizer.convert_ids_to_tokens(128000))

# Extend model embeddings
print(f"Extending model embeddings with strategy: {embedding_init_strategy}")
model = extend_embeddings.extend_model_embeddings(
    model, 
    num_new_tokens, 
    init_strategy=embedding_init_strategy,
    tokenizer=tokenizer
)

# model.forward = my_custom_forward.__get__(model, type(model))




num_new_tokens: 1000
len(tokenizer): 129256
```Ċ
Extending model embeddings with strategy: merge
The OrderedVocab you are attempting to save contains holes for indices [128000, 128001, 128002, 128003, 128004, 128005, 128006, 128007, 128008, 128009, 128010, 128011, 128012, 128013, 128014, 128015, 128016, 128017, 128018, 128019, 128020, 128021, 128022, 128023, 128024, 128025, 128026, 128027, 128028, 128029, 128030, 128031, 128032, 128033, 128034, 128035, 128036, 128037, 128038, 128039, 128040, 128041, 128042, 128043, 128044, 128045, 128046, 128047, 128048, 128049, 128050, 128051, 128052, 128053, 128054, 128055, 128056, 128057, 128058, 128059, 128060, 128061, 128062, 128063, 128064, 128065, 128066, 128067, 128068, 128069, 128070, 128071, 128072, 128073, 128074, 128075, 128076, 128077, 128078, 128079, 128080, 128081, 128082, 128083, 128084, 128085, 128086, 128087, 128088, 128089, 128090, 128091, 128092, 128093, 128094, 128095, 128096, 128097, 128098, 128099, 128100, 128101, 128102, 128103,

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


In [3]:
import datasets
from finetune import MyPaddingCollatorWithLossMask
from torch.utils.data import DataLoader

dataset_path = "/cmlscratch/astein0/efficient_tokenization_for_inference/datasets/translation_tokenized"

ds = datasets.load_from_disk(dataset_path)

# One-example batches drawn (shuffled) from the first ten rows, padded by the
# project's loss-mask-aware collator.
dl = DataLoader(
    ds.select(range(10)),
    collate_fn=MyPaddingCollatorWithLossMask(tokenizer=tokenizer),
    shuffle=True,
    batch_size=1,
)

# Grab a single batch to inspect.
batch = next(iter(dl))
print(batch)

{'input_ids': tensor([[128000,   2675,    527,    264,  11190,  15592,  18328,   1664,   5553,
            291,    304,  40916,    279,   1984,   1203,    311,   1124,    304,
            459,  13890,    719,    810,  11297,   1648,     13,    578,   4113,
           1984,    690,    387,  16717,    555,    364,   1342,    311,  13454,
           4989,    323,    842,    449,    364,    408,   1495,   3238,    220,
            578,  13454,   3857,    690,    387,  16717,    555,    364,  31724,
          92188,   1342,    311,  13454,     25,  83739,    882,     91,     29,
            763,    264,  12960,    449,   4219,    507,     11,    264,  44321,
          14469,    374,  77933,    311,    279,  23899,   6812,    520,   1486,
            423,     13,   1442,   9827,    284,    220,     19,  10166,    323,
          11162,    284,    220,     21,  10166,     11,   1505,    279,  10801,
            315,    279,  12960,  16134,     91,  78191,     91,     29,  15166,
            22

In [4]:
# Put the model on the GPU in training mode, then move every tensor of the
# sampled batch onto the same device.
model.to("cuda")
model.train()
batch = {name: tensor.to(model.device) for name, tensor in batch.items()}

# Report placement, architecture, and per-field shape/device of the batch.
print(model.device)
print(model)
for name, tensor in batch.items():
    print(name, tensor.shape, tensor.device)
    

cuda:0
LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(129256, 2048)
    (layers): ModuleList(
      (0-15): 16 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=512, bias=False)
          (v_proj): Linear(in_features=2048, out_features=512, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LigerSwiGLUMLP(
          (gate_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=2048, bias=False)
        )
        (input_layernorm): LigerRMSNorm((2048,), eps=1e-05, offset=0.0, in_place=True)
        (post_attention_layernorm): LigerRMSNorm((2048,), eps=1e-05, offset=0.0, 

In [6]:
# Three forward passes over the same batch, differing only in which label
# positions contribute to the loss (-100 marks a position as ignored).

# first do the loss on everything
input_ids = batch["input_ids"]
labels = batch["labels"]
loss_mask = batch["loss_mask"]
attention_mask = batch["attention_mask"]

outputs = model(
    input_ids=input_ids,
    attention_mask=attention_mask,
    labels=labels,
    use_cache=False,
    # num_items_in_batch=num_items_in_batch,
    # new_token_start_index=original_vocab_size
)
# Only the scalar loss is read (a later cell's output shows logits=None for this model).
loss = outputs.loss
print(loss)
# loss.backward()

with torch.no_grad():
    # Loss only on some of the tokens
    masked_labels = labels.clone()
    masked_labels[loss_mask==0] = -100

    outputs = model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        labels=masked_labels,
        use_cache=False,
    )
    loss = outputs.loss
    print(loss)
    # loss.backward()

    # then do the loss with the added-vocabulary labels masked out.
    # Ids are 0-based, so the added tokens start AT original_vocab_size; use >= so
    # the first added id is not left in by an off-by-one.
    # NOTE(review): this ignores the NEW tokens and keeps the original ones.  If the
    # intent is "loss only on the new tokens", the condition should instead be
    # `new_labels < original_vocab_size` -- confirm.

    new_labels = labels.clone()
    new_labels[new_labels >= original_vocab_size] = -100

    outputs = model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        labels=new_labels,
        use_cache=False,
    )
    loss = outputs.loss
    print(loss)
# loss.backward()

tensor(2.0396, device='cuda:0',
       grad_fn=<LigerFusedLinearCrossEntropyFunctionBackward>)
tensor(2.3922, device='cuda:0')
tensor(1.7377, device='cuda:0')


In [7]:
original_vocab_size

129256

In [10]:
outputs

CausalLMOutputWithPast(loss=tensor(0.8905, device='cuda:0',
       grad_fn=<LigerFusedLinearCrossEntropyFunctionBackward>), logits=None, past_key_values=None, hidden_states=None, attentions=None)

In [None]:
# new_embeddings_list = extend_embeddings.get_new_embeddings(model, num_new_tokens)
# new_embeddings_list[0]

# Output (LM head) embedding module: weights and any accumulated gradient.
embeddings_output = model.get_output_embeddings()
print(f"embeddings_output: {embeddings_output}")
print(f"embeddings_output.weight: {embeddings_output.weight}")
print(f"embeddings_output.weight.data: {embeddings_output.weight.data}")
print(f"embeddings_output.weight.grad: {embeddings_output.weight.grad}")

# First entry returned by the project helper for the newly added embedding params.
params = extend_embeddings.get_new_embedding_params(model, num_new_tokens)
this_param = params[0]
this_param.retain_grad()
print(f"params: {params}, length: {len(params)}")
print(f"this_param: {this_param}, shape: {this_param.shape}")
print(f"params.data: {this_param.data}, shape: {this_param.data.shape}")
print(f"params.grad: {this_param.grad}, shape: {this_param.grad.shape if this_param.grad is not None else 'None'}")

# Guard the shape lookup the same way as for `this_param.grad` above: every
# backward() call in this notebook is commented out, so a gradient may not exist.
# (Presumably the helper can return None entries in that case -- verify.)
grads = extend_embeddings.get_new_embeddings_grads(model, num_new_tokens)[0]
print(f"grads: {grads}, shape: {grads.shape if grads is not None else 'None'}")


In [None]:
import importlib

import transformers

import efficient_tokenization.tokenize_simple as tokenize_simple

# Reload first, THEN bind `get_tokenizer`: importlib.reload() does not rebind names
# previously imported with `from ... import`, so importing before the reload would
# keep calling the stale, pre-reload function.
importlib.reload(tokenize_simple)
from efficient_tokenization.tokenize_simple import get_tokenizer

# Kept from the original cell.  Note that reload() only re-executes the top-level
# package module, not its already-imported submodules.
importlib.reload(transformers)

model_name = "meta-llama/Llama-3.2-1B"
tokenizer_path = "/cmlscratch/astein0/LLM-pretraining/LLM-pretraining-tokenization/tokenizers/Llama-3.2-tokenizer-genqa-math-empty-start/new_mergeable_ranks_2000.model"
base_tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)

# Build the extended tokenizer from the merge-table file on top of the base tokenizer.
tokenizer = get_tokenizer(tokenizer_path, old_tokenizer=base_tokenizer)

In [None]:
len(tokenizer.get_vocab())

In [15]:
def bytes_to_unicode():
    """
    Build the byte -> printable-unicode-character table used by byte-level BPE.

    Bytes that are already printable, non-whitespace characters ("!".."~",
    "¡".."¬", "®".."ÿ") map to themselves.  Every other byte value (whitespace,
    control characters, ...) is assigned the next unused code point starting at
    256, in increasing byte order, so no byte ever maps to a character the BPE
    code would choke on.

    Returns:
        dict[int, str]: all 256 byte values as keys, each mapped to a distinct
        single-character string.  The printable bytes come first in insertion
        order, followed by the remapped ones.
    """
    printable = [
        *range(ord("!"), ord("~") + 1),
        *range(ord("¡"), ord("¬") + 1),
        *range(ord("®"), ord("ÿ") + 1),
    ]
    table = {byte: chr(byte) for byte in printable}

    # Remaining bytes are shifted into the 256+ range, one code point each.
    shift = 0
    for byte in range(2**8):
        if byte in table:
            continue
        table[byte] = chr(2**8 + shift)
        shift += 1
    return table

byte_encoder = bytes_to_unicode()

def token_bytes_to_string(b):
    """Render raw token bytes as their byte-level unicode string.

    Each byte of ``b`` is looked up in the module-level ``byte_encoder`` table and
    the resulting characters are concatenated.
    """
    # latin-1 decodes every byte to the code point with the same value, so ord()
    # of each decoded character recovers the original byte.
    decoded = b.decode("latin-1")
    pieces = []
    for char in decoded:
        pieces.append(byte_encoder[ord(char)])
    return "".join(pieces)

def unicode_to_bytes():
    """
    Invert the module-level ``byte_encoder`` table.

    Returns:
        dict mapping each unicode character produced by `bytes_to_unicode` back to
        the byte value it stands for.
    """
    inverse = {}
    for byte_value, char in byte_encoder.items():
        inverse[char] = byte_value
    return inverse

byte_decoder = unicode_to_bytes()

def string_to_token_bytes(s):
    """
    Turn a byte-level unicode token string back into its raw bytes.

    Every character of ``s`` is looked up in the module-level ``byte_decoder``
    table; a character that is not in the table raises ``KeyError``.

    Args:
        s (str): The input string to convert.

    Returns:
        bytes: The byte representation of the string.
    """
    raw = bytearray()
    for char in s:
        raw.append(byte_decoder[char])
    return bytes(raw)




In [None]:
import base64
# mergeable_ranks = read_tokenizer_from_model(base_tokenizer_path)

# Vocabulary ordered by token id, kept in `sorted_vocab` for the comparison cell below.
# NOTE(review): `llama_tokenizer` is not defined anywhere in the visible cells;
# presumably it is the base Llama tokenizer (cf. `base_tokenizer`) -- confirm.
sorted_vocab = dict(sorted(llama_tokenizer.vocab.items(), key=lambda item: item[1]))
for tok, i in sorted_vocab.items():
    # Every token is converted (not only the printed ones) so that a token with a
    # character outside the byte table still surfaces as a KeyError.
    my_bytes = string_to_token_bytes(tok)
    if i > 127988:
        # Tokens are byte-level unicode strings, not base64 text, so the former
        # base64.b64decode(tok) column was meaningless and raises on most tokens;
        # only the base64 *encoding* of the raw bytes is printed.
        print(f"{i:06d}: {tok}, {my_bytes}, {base64.b64encode(my_bytes)}")
    

In [18]:
from train_tokenizer import read_tokenizer_from_model
# old_path = "/cmlscratch/astein0/LLM-pretraining/LLM-pretraining-tokenization/tokenizers/Llama-3.2-tokenizer/tokenizer.model"
old_path = "/cmlscratch/astein0/LLM-pretraining/LLM-pretraining-tokenization/tokenizers/Llama-3.2-tokenizer-genqa-math-empty-start/new_mergeable_ranks_2000.model"
# Vocabulary read from the .model file by the project helper
# (presumably a token -> rank mapping; verify against train_tokenizer).
old_vocab = read_tokenizer_from_model(old_path)

# Copy of that vocabulary with the tokenizer's added tokens merged in; entries from
# get_added_vocab() win on key collisions.
# NOTE(review): `llama_tokenizer` is not defined in any visible cell -- confirm its source.
joined_tokenizer = old_vocab.copy()
joined_tokenizer.update(llama_tokenizer.get_added_vocab())


In [None]:
# Print the tail of the file-based vocabulary (ids above 127988) with the raw bytes
# and their base64 form.  Every token is converted, even the ones not printed.
for token, rank in old_vocab.items():
    token_bytes = string_to_token_bytes(token)
    if rank <= 127988:
        continue
    print(f"{rank:06d}: {token}, {token_bytes}, {base64.b64encode(token_bytes)}")
    

In [None]:
from typing import Dict
def compare_dicts(dict1: Dict, dict2: Dict) -> bool:
    """
    Report whether two dictionaries are equal, printing what differs if not.

    The key sets are compared first; if they differ, the keys unique to each side
    are printed and the values are not examined.  Otherwise every key whose values
    differ is printed together with both values.

    Args:
        dict1: First dictionary
        dict2: Second dictionary

    Returns:
        bool: True if dictionaries are identical, False otherwise
    """
    keys1, keys2 = dict1.keys(), dict2.keys()
    if keys1 != keys2:
        print("Different keys:")
        print("Keys only in first dict:", set(keys1) - set(keys2))
        print("Keys only in second dict:", set(keys2) - set(keys1))
        return False

    # Same keys on both sides: collect the entries whose values disagree,
    # in dict1's iteration order.
    mismatches = [(key, dict1[key], dict2[key]) for key in dict1 if dict1[key] != dict2[key]]
    if not mismatches:
        return True

    print("Different values:")
    for key, first, second in mismatches:
        print(f"Key: {key}")
        print(f"  Dict1: {first}")
        print(f"  Dict2: {second}")
    return False

# Check that the id-sorted tokenizer vocabulary and the file-based vocabulary merged
# with the added tokens agree; compare_dicts prints any differences it finds.
# (Relies on `sorted_vocab` and `joined_tokenizer` from the earlier cells.)
are_same = compare_dicts(sorted_vocab, joined_tokenizer)
print("Dictionaries are identical:", are_same)

In [None]:
import lm_eval 
importlib.reload(lm_eval)

# Arguments for the lm_eval Hugging Face wrapper: already-instantiated model and
# tokenizer objects are passed in directly rather than checkpoint names.
# NOTE(review): top_p=3 is outside the usual (0, 1] range for nucleus sampling --
# confirm the intended value (and whether the wrapper consumes it at all).
model_args_dict = dict(
    pretrained=model,
    tokenizer=tokenizer,
    old_tokenizer=base_tokenizer,
    # parallelize=True,
    do_sample=True,
    temperature=0.7,
    top_p=3,
    trust_remote_code=True,
)

LM_model = lm_eval.models.huggingface.HFLM(**model_args_dict)
