# safetensors conversion

Llama tensors are serialized in the `safetensors` format. For the Llama 3.2 1B model, the tensors are encoded in `bfloat16`.

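`bfloat16` cannot be used directly on CPU here (NumPy has no native `bfloat16` dtype), so we need to convert the tensors to `float32`. A `bfloat16` is just a truncated `float32`: it keeps the sign bit, the 8 exponent bits and the top 7 mantissa bits, and drops the lower 16 mantissa bits. Since the data is little-endian, we simply pad each `bfloat16` with two leading `0` bytes and reinterpret the result as a `float32`.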

In [22]:
!pip install numpy



In [23]:
# URLs of the files to download; each is saved in the current directory
# under the last path component of its URL.
required_files = [
    "https://huggingface.co/meta-llama/Llama-3.2-1B/resolve/main/model.safetensors"
]


import urllib.request
from pathlib import Path

# Download every required file that is not already present locally.
for required_file in required_files:
    local_file = required_file.split("/")[-1]
    if not Path(local_file).exists():
        # The token file usually ends with a newline; strip it, otherwise the
        # Authorization header value is invalid.
        with open("HF_TOKEN") as f:
            HF_TOKEN = f.read().strip()

        display(f"Downloading {local_file}")

        # urlretrieve has no headers parameter, so the bearer token is passed
        # through a globally installed opener.
        opener = urllib.request.build_opener()
        opener.addheaders = [("Authorization", f"Bearer {HF_TOKEN}")]
        urllib.request.install_opener(opener)

        # Download to a temporary name and rename once complete, so that an
        # interrupted download is not mistaken for a finished one on re-run.
        partial_file = f"{local_file}.part"
        urllib.request.urlretrieve(required_file, partial_file)
        Path(partial_file).replace(local_file)

In [24]:
def bfloat16_to_float32(bf16buffer: bytes) -> bytearray:
    """Convert a buffer of little-endian bfloat16 values to little-endian float32.

    A bfloat16 is the upper 16 bits of a float32 (sign, exponent and the top
    mantissa bits), so the conversion only inserts two zero bytes for the
    missing low mantissa bits.

    Args:
        bf16buffer: packed little-endian bfloat16 values, 2 bytes each.

    Returns:
        A bytearray twice as long as the input, holding the same values as
        packed little-endian float32.

    Raises:
        ValueError: if the buffer length is not a multiple of 2.
    """
    if len(bf16buffer) % 2 != 0:
        raise ValueError("the bfloat16 buffer should have an even number of bytes")
    # bytearray(n) is zero-filled, so the low mantissa bytes are already 0.
    result = bytearray(len(bf16buffer) * 2)
    # endianness: little-endian
    # BF16: 2 bytes : [ B0, B1 ]
    # Float32: 4 bytes : [0, 0, B0, B1]
    # Strided slice assignment copies every B0 and every B1 in one pass each,
    # instead of looping over the elements in Python.
    result[2::4] = bf16buffer[0::2]
    result[3::4] = bf16buffer[1::2]
    return result
    

In [25]:
import mmap
import struct
import json
import numpy as np

# Split model.safetensors into one raw float32 file per tensor under model/,
# plus a metadata.json holding the original safetensors header.
model_dir = Path("model")
# metadata.json is written last, so its presence marks a completed conversion.
# Testing for it (rather than for the directory) lets an interrupted run be
# retried instead of being silently skipped.
if not (model_dir / "metadata.json").exists():
    model_dir.mkdir(parents=True, exist_ok=True)
    # access=ACCESS_READ is the portable spelling of a read-only mapping;
    # the context manager closes the mapping once the conversion is done.
    with open("model.safetensors", mode="rb") as safetensors_file, mmap.mmap(
        safetensors_file.fileno(), 0, access=mmap.ACCESS_READ
    ) as mmaped:
        # See safetensors file format : https://github.com/huggingface/safetensors
        # Layout: 8-byte little-endian uint64 header size, JSON header, tensor data.
        (header_size,) = struct.unpack("<Q", mmaped[:8])
        data_start = 8 + header_size
        header = json.loads(mmaped[8:data_start])
        for tensor_name, tensor_metadata in header.items():
            if tensor_name == "__metadata__":
                continue

            # data_offsets are relative to the end of the header, so the
            # header size must be taken into account.
            start, end = tensor_metadata["data_offsets"]
            start += data_start
            end += data_start

            dtype = tensor_metadata["dtype"]
            print(f"Extracting {tensor_name}, {start}, {end}")
            if dtype != "BF16":
                # Only bfloat16 tensors are converted; make the skip visible
                # since no .raw file is produced for this tensor.
                print(f"Skipping {tensor_name}: unsupported dtype {dtype}")
                continue
            raw_tensor = bfloat16_to_float32(mmaped[start:end])
            with open(model_dir / f"{tensor_name}.raw", mode="wb") as raw_file:
                raw_file.write(raw_tensor)
    with open(model_dir / "metadata.json", mode="w") as metadata_file:
        json.dump(header, metadata_file)


Extracting model.embed_tokens.weight, 16808, 525353384
Extracting model.layers.0.input_layernorm.weight, 525353384, 525357480
Extracting model.layers.0.mlp.down_proj.weight, 525357480, 558911912
Extracting model.layers.0.mlp.gate_proj.weight, 558911912, 592466344
Extracting model.layers.0.mlp.up_proj.weight, 592466344, 626020776
Extracting model.layers.0.post_attention_layernorm.weight, 626020776, 626024872
Extracting model.layers.0.self_attn.k_proj.weight, 626024872, 628122024
Extracting model.layers.0.self_attn.o_proj.weight, 628122024, 636510632
Extracting model.layers.0.self_attn.q_proj.weight, 636510632, 644899240
Extracting model.layers.0.self_attn.v_proj.weight, 644899240, 646996392
Extracting model.layers.1.input_layernorm.weight, 646996392, 647000488
Extracting model.layers.1.mlp.down_proj.weight, 647000488, 680554920
Extracting model.layers.1.mlp.gate_proj.weight, 680554920, 714109352
Extracting model.layers.1.mlp.up_proj.weight, 714109352, 747663784
Extracting model.layers.1

In [5]:
def load_raw_model(path: str):
    """Load the raw float32 tensors of a converted model directory.

    Reads ``metadata.json`` in ``path`` and, for every entry other than
    ``__metadata__``, memory-maps ``<tensor name>.raw`` and exposes it as a
    float32 NumPy array reshaped to the entry's ``shape``.

    Args:
        path: directory containing ``metadata.json`` and the ``.raw`` files.

    Returns:
        A dict mapping each tensor name to a read-only NumPy array backed by
        the memory-mapped file.
    """
    model_dir = Path(path)
    model = {}
    with open(model_dir / "metadata.json") as metadata_file:
        metadata = json.load(metadata_file)
    for tensor_name, tensor_metadata in metadata.items():
        if tensor_name == "__metadata__":
            continue
        # The mapping stays valid after the file object is closed, so the
        # file handle does not need to be kept open (one per tensor otherwise).
        with open(model_dir / f"{tensor_name}.raw", mode="rb") as raw_file:
            mmaped = mmap.mmap(raw_file.fileno(), 0, access=mmap.ACCESS_READ)
        # The array keeps a reference to the mapping, which keeps it alive.
        model[tensor_name] = np.frombuffer(mmaped, dtype=np.float32).reshape(tensor_metadata["shape"])
    return model
    

In [6]:
# Load every converted tensor from the "model" directory into a dict keyed by tensor name.
model = load_raw_model("model")

In [7]:
# Sanity check: shape of the token embedding tensor.
model["model.embed_tokens.weight"].shape

(128256, 2048)

In [8]:
# List the names of all loaded tensors.
model.keys()

dict_keys(['model.embed_tokens.weight', 'model.layers.0.input_layernorm.weight', 'model.layers.0.mlp.down_proj.weight', 'model.layers.0.mlp.gate_proj.weight', 'model.layers.0.mlp.up_proj.weight', 'model.layers.0.post_attention_layernorm.weight', 'model.layers.0.self_attn.k_proj.weight', 'model.layers.0.self_attn.o_proj.weight', 'model.layers.0.self_attn.q_proj.weight', 'model.layers.0.self_attn.v_proj.weight', 'model.layers.1.input_layernorm.weight', 'model.layers.1.mlp.down_proj.weight', 'model.layers.1.mlp.gate_proj.weight', 'model.layers.1.mlp.up_proj.weight', 'model.layers.1.post_attention_layernorm.weight', 'model.layers.1.self_attn.k_proj.weight', 'model.layers.1.self_attn.o_proj.weight', 'model.layers.1.self_attn.q_proj.weight', 'model.layers.1.self_attn.v_proj.weight', 'model.layers.10.input_layernorm.weight', 'model.layers.10.mlp.down_proj.weight', 'model.layers.10.mlp.gate_proj.weight', 'model.layers.10.mlp.up_proj.weight', 'model.layers.10.post_attention_layernorm.weight', '

In [10]:
# Pick one of the layer-norm weight tensors to inspect.
t = model["model.layers.0.input_layernorm.weight"]

In [20]:
# transpose() leaves a 1-D array unchanged, so the shape is still 1-D.
t.transpose().shape

(2048,)