In [1]:
from pprint import pprint
from parsers import ModelParser

import torch
from transformers import AutoTokenizer

# Build the parser from a single safetensors shard (file 1 of 4 by its name).
model_parser = ModelParser(["../Meta-Llama-3-8B/model-00001-of-00004.safetensors"])

In [2]:
# Pretty-print the tensor names the parser exposes for the file(s) it was given.
pprint(model_parser.tensor_names)

['model.embed_tokens.weight',
 'model.layers.0.input_layernorm.weight',
 'model.layers.0.mlp.down_proj.weight',
 'model.layers.0.mlp.gate_proj.weight',
 'model.layers.0.mlp.up_proj.weight',
 'model.layers.0.post_attention_layernorm.weight',
 'model.layers.0.self_attn.k_proj.weight',
 'model.layers.0.self_attn.o_proj.weight',
 'model.layers.0.self_attn.q_proj.weight',
 'model.layers.0.self_attn.v_proj.weight',
 'model.layers.1.input_layernorm.weight',
 'model.layers.1.mlp.down_proj.weight',
 'model.layers.1.mlp.gate_proj.weight',
 'model.layers.1.mlp.up_proj.weight',
 'model.layers.1.post_attention_layernorm.weight',
 'model.layers.1.self_attn.k_proj.weight',
 'model.layers.1.self_attn.o_proj.weight',
 'model.layers.1.self_attn.q_proj.weight',
 'model.layers.1.self_attn.v_proj.weight',
 'model.layers.2.input_layernorm.weight',
 'model.layers.2.mlp.down_proj.weight',
 'model.layers.2.mlp.gate_proj.weight',
 'model.layers.2.mlp.up_proj.weight',
 'model.layers.2.post_attention_layernorm.we

## Prepare text and embeddings

In [3]:
# Load the tokenizer from the local checkpoint directory.
tokenizer = AutoTokenizer.from_pretrained("../Meta-Llama-3-8B/")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
# Two-message conversation (system + user) used as the test prompt.
messages = [
    {"role": "system", "content": "You are a pirate chatbot who always responds in pirate speak!"},
    {"role": "user", "content": "Who are you?"},
]

# Render the conversation through the tokenizer's chat template and return the
# token ids as a PyTorch tensor.
# NOTE(review): the warning recorded for this cell says no chat template is
# defined for this tokenizer, so a default ChatML template is applied. Confirm
# that this is the prompt format the reference data was produced with.
input_ids = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt"
)  # the tensor is not moved to any device here

# Candidate stop-token ids: the tokenizer's EOS id plus the id of "<|eot_id|>".
# Presumably meant for a generation loop; not used in the cells visible here.
terminators = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>")
]


No chat template is defined for this tokenizer - using a default chat template that implements the ChatML format (without BOS/EOS tokens!). If the default is not appropriate for your model, please set `tokenizer.chat_template` to an appropriate template. See https://huggingface.co/docs/transformers/main/chat_templating for more information.



In [5]:
# Display the token ids produced by the chat template.
input_ids

tensor([[   27,    91,   318,  5011,    91,    29,  9125,   198,  2675,   527,
           264, 55066,  6369,  6465,   889,  2744, 31680,   304, 55066,  6604,
         88032,    91,   318,  6345,    91,   397,    27,    91,   318,  5011,
            91,    29,   882,   198, 15546,   527,   499, 76514,    91,   318,
          6345,    91,   397,    27,    91,   318,  5011,    91,    29, 78191,
           198]])

## Forward passes

In [6]:
# Load the recorded reference data; later cells index it by
# data_orig["inputs"][<module name>] and data_orig["outputs"][<module name>].
# NOTE(review): pickle.load can execute arbitrary code while deserialising,
# so only open this file if it comes from a trusted source.
# NOTE(review): this import would normally live in the notebook's first
# import cell alongside the others.
import pickle
with open("intermediate_data_llama3_8b.pkl", "rb") as f:
    data_orig = pickle.load(f)

In [7]:
from transformer_ops import embedding_matrix, RMSNorm

In [10]:
# Fetch the token-embedding weight tensor by name, then pass it together with
# the token ids to embedding_matrix to obtain the input embeddings.
model_embed_tokens_weight = model_parser.get_tensor('model.embed_tokens.weight')
input_embeddings = embedding_matrix(inputs=input_ids, weights=model_embed_tokens_weight)

In [12]:
# One RMSNorm instance for each of layer 0's two norms. The weights are not
# bound here; they are passed to forward() explicitly in the checks below.
# NOTE(review): both are constructed with default arguments. Confirm that the
# default epsilon matches the model's rms_norm_eps (1e-05 in the config cell).
model_layers_0_input_layernorm = RMSNorm()
model_layers_0_post_attention_layernorm = RMSNorm()

In [14]:
# Fetch the weight tensors of layer 0's two norms from the parser by name.
model_layers_0_input_layernorm_weight = model_parser.get_tensor('model.layers.0.input_layernorm.weight')
model_layers_0_post_attention_layernorm_weight = model_parser.get_tensor('model.layers.0.post_attention_layernorm.weight')

In [9]:
# NOTE(review): this rebinds `input_embeddings`, replacing the value computed
# by embedding_matrix with the first recorded input of layer 0's input
# layernorm from the reference data. The cell's execution count (In [9]) is
# out of order relative to its neighbours, so which value `input_embeddings`
# holds afterwards depends on run order. Confirm which one is intended and
# consider a distinct name for the reference tensor.
input_embeddings = data_orig["inputs"]["model.layers.0.input_layernorm"][0]

In [22]:
# Layer-0 input RMSNorm check: run the norm on the recorded reference input
# and compare the result against the recorded reference output.
torch.allclose(
    model_layers_0_input_layernorm.forward(
        inputs=data_orig["inputs"]["model.layers.0.input_layernorm"][0],
        weights=model_layers_0_input_layernorm_weight,
    ),
    data_orig["outputs"]["model.layers.0.input_layernorm"],
)

True

In [23]:
# Layer-0 post-attention RMSNorm check: run the norm on the recorded reference
# input and compare the result against the recorded reference output.
torch.allclose(
    model_layers_0_post_attention_layernorm.forward(
        inputs=data_orig["inputs"]["model.layers.0.post_attention_layernorm"][0],
        weights=model_layers_0_post_attention_layernorm_weight,
    ),
    data_orig["outputs"]["model.layers.0.post_attention_layernorm"],
)

True

In [26]:
# Model hyperparameters kept inline as a plain dict.
# NOTE(review): presumably copied from the checkpoint's config.json — confirm
# it stays in sync with the files under ../Meta-Llama-3-8B/.
config = {
    # Architecture / attention options
    "architectures": ["LlamaForCausalLM"],
    "attention_bias": False,
    "attention_dropout": 0.0,
    # Special token ids
    "bos_token_id": 128000,
    "eos_token_id": 128001,
    # Layer sizes
    "hidden_act": "silu",
    "hidden_size": 4096,
    "initializer_range": 0.02,
    "intermediate_size": 14336,
    "max_position_embeddings": 8192,
    "model_type": "llama",
    # Head and depth counts
    "num_attention_heads": 32,
    "num_hidden_layers": 32,
    "num_key_value_heads": 8,
    "pretraining_tp": 1,
    # Normalisation and rotary-embedding settings
    "rms_norm_eps": 1e-05,
    "rope_scaling": None,
    "rope_theta": 500000.0,
    # Remaining metadata
    "tie_word_embeddings": False,
    "torch_dtype": "bfloat16",
    "transformers_version": "4.40.0.dev0",
    "use_cache": True,
    "vocab_size": 128256,
}

def precompute_rope_constants(dim: int, end: int, theta: float = 10000.0):
    """
    Precompute the complex rotation factors used by rotary position embeddings.

    For every position p in [0, end) and every frequency index k in
    [0, dim // 2), the result holds the unit-magnitude complex number
    exp(i * p * theta ** (-2k / dim)).

    Args:
        dim: Size of the dimension being rotated; values are paired, so
            dim // 2 frequencies are produced.
        end: Number of positions to precompute.
        theta: Base of the geometric frequency progression.

    Returns:
        A complex64 tensor of shape (end, dim // 2).

    References:
        - https://blog.eleuther.ai/rotary-embeddings/
        - https://github.com/meta-llama/llama3/blob/main/llama/model.py
    """
    # Exponents 0/dim, 2/dim, 4/dim, ... (one per pair of dimensions).
    exponents = torch.arange(0, dim, 2)[: (dim // 2)].float() / dim
    inv_freq = 1.0 / (theta ** exponents)

    # Position indices live on the same device as the frequencies.
    positions = torch.arange(end, device=inv_freq.device, dtype=torch.float32)

    # angles[p, k] = p * inv_freq[k]
    angles = torch.outer(positions, inv_freq)

    # Magnitude 1, phase = angle -> complex64 rotation factors.
    return torch.polar(torch.ones_like(angles), angles)

In [30]:
# Precompute rotary factors for one attention head:
#   dim   -> hidden size divided evenly across the attention heads
#   end   -> twice the configured maximum number of positions
#   theta -> the model's RoPE base
freqs_cis = precompute_rope_constants(
    dim=config["hidden_size"] // config["num_attention_heads"],
    end=config["max_position_embeddings"] * 2,
    theta=config["rope_theta"],
)

In [32]:
# Sanity check: expect (2 * max_position_embeddings, head_dim // 2),
# i.e. (16384, 64) for the config above.
freqs_cis.shape

torch.Size([16384, 64])