In [None]:
# local use the latest by pip install .
# the indent of local jupyter and remote colab is different
!pip install gguf

In [None]:
import torch
import numpy as np
import gguf
from gguf import GGUFReader
import os
import re
import subprocess
from safetensors import safe_open

# Turn autograd off globally: the cells below only dequantize/convert weights
# and run generation, so no gradients are needed.
torch.set_grad_enabled(False)

## Dequant

<details>
<summary>permute and inverse_permute demo</summary>

https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/convert_llama_weights_to_hf.py#L222

```python
def permute(w, n_heads, dim1, dim2):
    return w.view(n_heads, dim1 // n_heads // 2, 2, dim2).transpose(1, 2).reshape(dim1, dim2)

def inverse_permute(w, n_heads, dim1, dim2):
    w = w.view(n_heads, 2, dim1 // n_heads // 2, dim2)
    w = w.transpose(1, 2)
    w = w.reshape(dim1, dim2)
    return w

n_heads = 2
dim1 = 12
dim2 = 12
w= torch.arange(dim1 * dim2).view(dim1, dim2)
permuted_w = permute(w, n_heads, dim1, dim2)
print(w)
print(permuted_w)
w = inverse_permute(permuted_w, n_heads, dim1, dim2)
print(w)
```

</details>

In [None]:
# Maps a GGUF tensor-name fragment (key) to the matching PyTorch/Hugging Face
# module-name fragment (value). The loader functions append the suffix themselves
# ('_proj.weight', '_proj.qweight', ... or '.weight' for the norm layers).
# awq and gptq share the same pytorch name 
name_map = {
    'attn_q': 'self_attn.q',
    'attn_k': 'self_attn.k',
    'attn_v': 'self_attn.v',
    'attn_output': 'self_attn.o',
    'ffn_down': 'mlp.down',
    'ffn_gate': 'mlp.gate',
    'ffn_up': 'mlp.up',
    'attn_norm': 'input_layernorm',
    'ffn_norm': 'post_attention_layernorm'
}

def pt_get_tensor(reader, prefix, name):
    """Fetch tensors that are stored unquantized in the PyTorch checkpoint.

    Args:
        reader: object exposing ``get_tensor(key)`` (e.g. a safetensors handle).
        prefix: per-layer prefix such as ``'model.layers.3.'``.
        name: GGUF tensor name.

    Returns:
        The token-embedding tensor for ``'token_embd.weight'``, the matching
        layer-norm weight when ``name`` contains ``'attn_norm'`` or
        ``'ffn_norm'``, otherwise ``None``.
    """
    if name == 'token_embd.weight':
        return reader.get_tensor('model.embed_tokens.weight')

    # First norm fragment contained in the name, if any.
    norm_key = next((k for k in ('attn_norm', 'ffn_norm') if k in name), None)
    if norm_key is None:
        return None
    return reader.get_tensor(prefix + name_map[norm_key] + '.weight')

# Row permutation used for the "sliced rotary" layout.
def permute(w, n_heads, dim1, dim2):
    """Reorder the rows of a ``(dim1, dim2)`` matrix head by head.

    Each head's rows are viewed as ``(half, 2)`` and swapped to ``(2, half)``,
    where ``half = dim1 // n_heads // 2``.
    """
    half = dim1 // n_heads // 2
    per_head = w.view(n_heads, half, 2, dim2)
    swapped = per_head.transpose(1, 2)
    return swapped.reshape(dim1, dim2)

# inverse permute for sliced rotary
def inverse_permute(name, w, num_heads=None, num_kv_heads=None):
    """Undo the rotary row permutation for query/key projection weights.

    Args:
        name: GGUF tensor name. Only names containing ``'attn_q'`` or
            ``'attn_k'`` are permuted; any other tensor is returned unchanged.
        w: 2-D weight tensor.
        num_heads: head count used for ``attn_q`` tensors. When omitted, the
            notebook-level global ``n_heads`` is used (original behaviour).
        num_kv_heads: head count used for ``attn_k`` tensors. When omitted, the
            notebook-level global ``n_kv_heads`` is used (original behaviour).

    Returns:
        A tensor with the same shape as ``w``.

    Raises:
        NameError: if a head count is needed, no override was passed, and the
            corresponding global has not been defined yet.
    """
    if 'attn_q' in name:
        heads = n_heads if num_heads is None else num_heads
    elif 'attn_k' in name:
        heads = n_kv_heads if num_kv_heads is None else num_kv_heads
    else:
        return w

    # The globals are read from GGUF metadata and may be numpy scalars;
    # normalise to a plain int before using it as a view dimension.
    heads = int(heads)
    rows, cols = w.shape
    return w.view(heads, 2, rows // heads // 2, cols).transpose(1, 2).reshape(rows, cols)

def load_fakequant_tensor(reader, name):
    """Load the full-precision ``*_proj.weight`` tensor matching a GGUF tensor name.

    Embedding and layer-norm tensors are returned as stored (via
    ``pt_get_tensor``). Every other tensor is read from
    ``model.layers.{layer}.<module>_proj.weight``, cast to float32 and passed
    through ``inverse_permute``.

    Args:
        reader: object exposing ``get_tensor(key)``.
        name: GGUF tensor name; the layer index is taken from its second
            dot-separated component (``'blk.{layer}.xxx'``).

    Raises:
        ValueError: if ``name`` matches no entry of ``name_map``. The original
            code failed here with an UnboundLocalError on ``pt_name``.
    """
    layer = name.split('.')[1]  # f'blk.{layer}.xxx'
    prefix = f'model.layers.{layer}.'

    tensor = pt_get_tensor(reader, prefix, name)
    if tensor is not None:
        return tensor

    # First name_map key contained in the GGUF name (dict order is significant).
    key = next((k for k in name_map if k in name), None)
    if key is None:
        raise ValueError(f'no PyTorch tensor mapping for GGUF tensor {name!r}')

    pt_name = prefix + name_map[key] + '_proj.weight'
    return inverse_permute(name, reader.get_tensor(pt_name).float())

https://github.com/mit-han-lab/llm-awq/blob/main/awq/quantize/quantizer.py

<details>
<summary>awq quant logic</summary>

[Question about the zero point](https://github.com/mit-han-lab/llm-awq/issues/116)

I noticed that only negative minimum values are preserved as zero points with the code.

```python
    if zero_point:
        max_val = w.amax(dim=1, keepdim=True)
        min_val = w.amin(dim=1, keepdim=True)
        max_int = 2**n_bit - 1
        min_int = 0
        scales = (max_val - min_val).clamp(min=1e-5) / max_int
        zeros = (-torch.round(min_val / scales)).clamp_(min_int, max_int)
```

Then, why not preserve all the minimum values including the positive values?
</details>

In [None]:
# https://medium.com/@crclq2018/awq-how-its-code-works-1ea92fb80bd2
def load_awq_tensor(reader, name):
    """Dequantize a 4-bit AWQ-packed projection weight for a GGUF tensor name.

    Embedding and layer-norm tensors are returned as stored (via
    ``pt_get_tensor``). Otherwise ``qweight``, ``qzeros`` and ``scales`` are
    read from ``model.layers.{layer}.<module>_proj.``, unpacked to 4-bit
    values, combined as ``scales * (weight - zeros)``, transposed and passed
    through ``inverse_permute``.

    Args:
        reader: object exposing ``get_tensor(key)``.
        name: GGUF tensor name of the form ``'blk.{layer}.xxx'``.

    Returns:
        The stored tensor for embeddings/norms, otherwise a float32 tensor.

    Raises:
        ValueError: if ``name`` matches no entry of ``name_map``. The original
            code failed here with an UnboundLocalError on ``pt_name``.
    """
    layer = name.split('.')[1]  # f'blk.{layer}.xxx'
    prefix = f'model.layers.{layer}.'

    tensor = pt_get_tensor(reader, prefix, name)
    if tensor is not None:
        return tensor

    key = next((k for k in name_map if k in name), None)
    if key is None:
        raise ValueError(f'no PyTorch tensor mapping for GGUF tensor {name!r}')
    pt_name = prefix + name_map[key] + '_proj.'

    qweight = reader.get_tensor(pt_name + 'qweight')
    qzeros = reader.get_tensor(pt_name + 'qzeros')
    scales = reader.get_tensor(pt_name + 'scales')

    # dequantize
    # The reshapes below put one row of scales/zeros against each group of
    # qweight rows, so the group size is the ratio of the two row counts
    # (128 for the checkpoints used in this notebook, previously hard-coded).
    group_size = qweight.shape[0] // scales.shape[0]

    # Bit offsets of the eight 4-bit values packed in each int32, in the
    # interleaved order this loader uses for AWQ: 0, 4, 1, 5, 2, 6, 3, 7.
    wf = torch.tensor([x * 4 for x in [0, 4, 1, 5, 2, 6, 3, 7]], dtype=torch.int32).unsqueeze(0)

    zeros = torch.bitwise_right_shift(torch.unsqueeze(qzeros, 2).expand(-1, -1, 8), wf.unsqueeze(0)).to(torch.int8)
    zeros = torch.bitwise_and(zeros, 0xf)  # keep the low nibble only
    zeros = zeros.reshape(-1, 1, zeros.shape[1] * zeros.shape[2])

    weight = torch.bitwise_right_shift(torch.unsqueeze(qweight, 2).expand(-1, -1, 8), wf.unsqueeze(0)).to(torch.int8)
    weight = torch.bitwise_and(weight, 0xf)
    weight = weight.reshape(-1, group_size, weight.shape[1] * weight.shape[2])

    scales = scales.reshape(-1, 1, scales.shape[-1])
    weight = scales * (weight - zeros)  # relies on implicit broadcasting over the group axis
    weight = weight.reshape(weight.shape[0] * weight.shape[1], weight.shape[2])

    return inverse_permute(name, weight.float().T)

https://github.com/qwopqwop200/GPTQ-for-LLaMa/blob/triton/gptq.py

https://github.com/qwopqwop200/GPTQ-for-LLaMa/blob/triton/quant/quantizer.py

<details>
<summary>gptq quant logic</summary>


```python

# if actorder:
H = torch.tensor([[8,100,100,100], [100,7,100,100], [100,100,5,100], [100,100,100,9]])
g_idx = torch.tensor([i//2 for i in range(4)])
perm = torch.argsort(torch.diag(H), descending=True)
# [3, 0, 1, 2]
invperm = torch.argsort(perm)
# [1, 2, 3, 0]
g_idx = g_idx[invperm]
# [0, 1, 1, 0]
 
if self.maxq < 0:
    self.scale = xmax
    self.zero = xmin
else:
    self.scale = (xmax - xmin) / self.maxq
    if self.sym:
        self.zero = torch.full_like(self.scale, (self.maxq + 1) / 2)
    else:
        self.zero = torch.round(-xmin / self.scale)
```
</details>

In [None]:
# https://github.com/AutoGPTQ/AutoGPTQ/blob/main/auto_gptq/nn_modules/qlinear/qlinear_cuda.py
def load_gptq_tensor(reader, name):
    """Dequantize a 4-bit GPTQ-packed projection weight for a GGUF tensor name.

    Embedding and layer-norm tensors are returned as stored (via
    ``pt_get_tensor``). Otherwise ``qweight``, ``qzeros``, ``g_idx`` and
    ``scales`` are read from ``model.layers.{layer}.<module>_proj.``, unpacked
    to 4-bit values, combined as ``scales[g_idx] * (weight - zeros[g_idx])``,
    transposed and passed through ``inverse_permute``.

    Args:
        reader: object exposing ``get_tensor(key)``.
        name: GGUF tensor name of the form ``'blk.{layer}.xxx'``.

    Returns:
        The stored tensor for embeddings/norms, otherwise a float32 tensor.

    Raises:
        ValueError: if ``name`` matches no entry of ``name_map``. The original
            code failed here with an UnboundLocalError on ``pt_name``.
    """
    layer = name.split('.')[1]  # f'blk.{layer}.xxx'
    prefix = f'model.layers.{layer}.'

    tensor = pt_get_tensor(reader, prefix, name)
    if tensor is not None:
        return tensor

    key = next((k for k in name_map if k in name), None)
    if key is None:
        raise ValueError(f'no PyTorch tensor mapping for GGUF tensor {name!r}')
    pt_name = prefix + name_map[key] + '_proj.'

    qzeros = reader.get_tensor(pt_name + 'qzeros')
    qweight = reader.get_tensor(pt_name + 'qweight')
    g_idx = reader.get_tensor(pt_name + 'g_idx')
    scales = reader.get_tensor(pt_name + 'scales')

    # dequantize
    # Bit offsets 0, 4, ..., 28 of the eight 4-bit values packed in each int32.
    wf = torch.tensor(list(range(0, 32, 4)), dtype=torch.int32).unsqueeze(0)

    # Zero-points are unpacked along the last axis of qzeros.
    zeros = torch.bitwise_right_shift(torch.unsqueeze(qzeros, 2).expand(-1, -1, 8), wf.unsqueeze(0)).to(torch.int8)
    zeros = torch.bitwise_and(zeros, 0xf)
    # The stored zero-points are one less than the value used for dequantization.
    # NOTE(review): presumably mirrors the AutoGPTQ kernel linked above — confirm for new checkpoints.
    zeros = zeros + 1
    zeros = zeros.reshape(scales.shape)

    # Weights are unpacked along the first axis of qweight.
    weight = torch.bitwise_right_shift(torch.unsqueeze(qweight, 1).expand(-1, 8, -1), wf.unsqueeze(-1)).to(torch.int8)
    weight = torch.bitwise_and(weight, 0xf)
    weight = weight.reshape(weight.shape[0] * weight.shape[1], weight.shape[2])

    # g_idx picks the scale/zero row (quantization group) for every weight row.
    group = g_idx.long()
    weight = scales[group] * (weight - zeros[group])

    return inverse_permute(name, weight.float().T)

In [None]:
def load_autoround_tensor(reader, name):
    """Dequantize a 4-bit AutoRound-packed projection weight for a GGUF tensor name.

    The packing matches ``load_gptq_tensor`` (sequential nibble order, stored
    zero-points offset by one), but no ``g_idx`` is read: consecutive rows are
    assumed to form the quantization groups.

    Embedding and layer-norm tensors are returned as stored (via
    ``pt_get_tensor``).

    Args:
        reader: object exposing ``get_tensor(key)``.
        name: GGUF tensor name of the form ``'blk.{layer}.xxx'``.

    Returns:
        The stored tensor for embeddings/norms, otherwise a float32 tensor.

    Raises:
        ValueError: if ``name`` matches no entry of ``name_map``. The original
            code failed here with an UnboundLocalError on ``pt_name``.
    """
    layer = name.split('.')[1]  # f'blk.{layer}.xxx'
    prefix = f'model.layers.{layer}.'

    tensor = pt_get_tensor(reader, prefix, name)
    if tensor is not None:
        return tensor

    key = next((k for k in name_map if k in name), None)
    if key is None:
        raise ValueError(f'no PyTorch tensor mapping for GGUF tensor {name!r}')
    pt_name = prefix + name_map[key] + '_proj.'

    qzeros = reader.get_tensor(pt_name + 'qzeros')
    qweight = reader.get_tensor(pt_name + 'qweight')
    scales = reader.get_tensor(pt_name + 'scales')

    # dequantize
    # Each qweight row unpacks into 8 weight rows, and the reshapes below put
    # one row of scales/zeros against each group of unpacked rows, so the group
    # size follows from the shapes (128 for the checkpoints used in this
    # notebook, previously hard-coded).
    group_size = (qweight.shape[0] * 8) // scales.shape[0]

    # Bit offsets 0, 4, ..., 28 of the eight 4-bit values packed in each int32.
    wf = torch.tensor(list(range(0, 32, 4)), dtype=torch.int32).unsqueeze(0)

    zeros = torch.bitwise_right_shift(torch.unsqueeze(qzeros, 2).expand(-1, -1, 8), wf.unsqueeze(0)).to(torch.int8)
    zeros = torch.bitwise_and(zeros, 0xf)
    # The stored zero-points are one less than the value used for dequantization.
    zeros = zeros + 1
    zeros = zeros.reshape(-1, 1, zeros.shape[1] * zeros.shape[2])

    weight = torch.bitwise_right_shift(torch.unsqueeze(qweight, 1).expand(-1, 8, -1), wf.unsqueeze(-1)).to(torch.int8)
    weight = torch.bitwise_and(weight, 0xf)
    weight = weight.reshape(-1, group_size, weight.shape[2])

    scales = scales.reshape(-1, 1, scales.shape[-1])
    weight = scales * (weight - zeros)  # relies on implicit broadcasting over the group axis
    weight = weight.reshape(weight.shape[0] * weight.shape[1], weight.shape[2])

    return inverse_permute(name, weight.float().T)

# Model file

## GGUF

In [None]:
# GGUF option: TinyStories-656K (Q4_K_S). Run exactly one of these selection cells;
# whichever ran last decides which file the loading cell uses.
gguf_file = "models/TinyStories-656K.Q4_K_S.gguf"
huggingface_repo = "https://huggingface.co/mradermacher/TinyStories-656K-GGUF/resolve/main/"

In [None]:
# GGUF option: SmolLM2-135M-Instruct (Q8_0). Overrides any earlier selection cell.
gguf_file = "models/SmolLM2-135M-Instruct-Q8_0.gguf"
huggingface_repo = "https://huggingface.co/bartowski/SmolLM2-135M-Instruct-GGUF/resolve/main/"

In [None]:
# GGUF option: Llama-3.2-1B-Instruct (IQ4_XS). Overrides any earlier selection cell.
gguf_file = "models/Llama-3.2-1B-Instruct-IQ4_XS.gguf"
huggingface_repo = "https://huggingface.co/bartowski/Llama-3.2-1B-Instruct-GGUF/resolve/main/"

In [None]:
# GGUF option: Meta-Llama-3.1-8B-Instruct (Q8_0). Overrides any earlier selection cell;
# on "Run All" this is the selection that takes effect, being the last one.
gguf_file = "models/Meta-Llama-3.1-8B-Instruct-Q8_0.gguf"
huggingface_repo = "https://huggingface.co/bartowski/Llama-3.1-8B-Instruct-GGUF/resolve/main/"

In [None]:
# Download the selected GGUF file on first use, then open it and collect the
# model hyper-parameters and a name -> index lookup for its tensors.
if not os.path.exists(gguf_file):
    print("file not found, download from internet...")
    # Make sure the target directory exists before wget tries to write into it.
    os.makedirs(os.path.dirname(gguf_file) or ".", exist_ok=True)
    # The remote file name is the local file name without its directory.
    download_url = huggingface_repo + os.path.basename(gguf_file)
    try:
        subprocess.run(["wget", "-O", gguf_file, download_url], check=True)
    except subprocess.CalledProcessError:
        # `wget -O` creates the output file even when the download fails; remove
        # the stub so the next run retries instead of reading a broken file.
        if os.path.exists(gguf_file):
            os.remove(gguf_file)
        raise

gguf_reader = GGUFReader(gguf_file, 'r')

# Keep only the first element of each field's data part: enough for the scalar
# hyper-parameters read below, but strings and arrays are truncated here.
metadata = {
    key: field.parts[field.data[0]][0]
    for key, field in gguf_reader.fields.items()
}

vocab_size  = metadata['llama.vocab_size']
hidden_size = metadata['llama.embedding_length']
n_blocks    = metadata['llama.block_count']
n_heads     = metadata['llama.attention.head_count']
n_kv_heads  = metadata['llama.attention.head_count_kv']
rope_theta  = metadata['llama.rope.freq_base']
norm_eps    = metadata['llama.attention.layer_norm_rms_epsilon']
n_dims      = metadata['llama.rope.dimension_count']
n_tensors   = metadata['GGUF.tensor_count']
n_layer     = 0

# Tensor name -> position in the GGUF file, for lookups by name.
tensor_idx = {gguf_reader.get_tensor(i).name: i for i in range(n_tensors)}
gguf_file

## AWQ/GPTQ

<details>
<summary>analysis</summary>

```python
tensor_names = awq_reader.keys()
for name in tensor_names:
    tensor = awq_reader.get_tensor(name)
    print(f"name: {name}")
    print(f"shape: {tensor.shape}")
    print(f"type: {tensor.dtype}")
    # print(f"tensor:\n{tensor}")
    print("-" * 50)

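# A scary fact: the tensors read from GGUF and from AWQ/GPTQ are transposes of each other.
# Moreover, the GGUF tensor layout follows the ordering of the model weights originally released by Meta,
# while AWQ/GPTQ follow the Hugging Face layout, since AWQ/GPTQ are part of the Hugging Face ecosystem.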
print(gguf_reader.get_tensor(tensor_idx[f'blk.{0}.ffn_down.weight']).shape)
print(awq_reader.get_tensor(f'model.layers.{0}.mlp.down_proj.qweight').shape)
print(gptq_reader.get_tensor(f'model.layers.{0}.mlp.down_proj.qweight').shape)

print(load_gguf_tensor(gguf_reader, 'output_norm.weight'))
print(awq_reader.get_tensor('model.norm.weight'))
print(gptq_reader.get_tensor('model.norm.weight'))

layer=0
print(load_gguf_tensor(gguf_reader, f'blk.{layer}.attn_k.weight').shape)
print(load_awq_tensor(awq_reader, f'blk.{layer}.attn_k.weight').shape)
print(load_gptq_tensor(gptq_reader, f'blk.{layer}.attn_k.weight').shape)
```
    
</details>

In [None]:
# Checkpoint option: full-precision weights loaded with load_fakequant_tensor.
# Run exactly one of these selection cells; whichever ran last wins.
# NOTE(review): `url` is a note rather than a downloadable address — presumably the
# file has to be produced locally with llmc first; the download cell cannot fetch it.
url = "llmc generate"
model_path = "models/Llama-3.2-1B-Instruct-llmc-awq.safetensors"
output_gguf = "Llama-3.2-1B-Instruct-llmc-awq.gguf"
load_tensor = load_fakequant_tensor

In [None]:
# Checkpoint option: AMead10 AWQ checkpoint, dequantized with load_awq_tensor.
url = "https://huggingface.co/AMead10/Llama-3.2-1B-Instruct-AWQ/resolve/main/model.safetensors"
model_path = "models/Llama-3.2-1B-Instruct-AWQ.safetensors"
output_gguf = "Llama-3.2-1B-Instruct-AWQ.gguf"
load_tensor = load_awq_tensor

In [None]:
# Checkpoint option: auto_awq int4, group size 128, asymmetric; dequantized with load_awq_tensor.
# Intel's GPTQ/AutoRound family dequantizes to the same values; only the storage format differs.
url = "https://huggingface.co/fbaldassarri/meta-llama_Llama-3.2-1B-Instruct-auto_awq-int4-gs128-asym/resolve/main/model.safetensors"
model_path = "models/Llama-3.2-1B-Instruct-auto_awq-int4-gs128-asym.safetensors"
output_gguf = "Llama-3.2-1B-Instruct-auto_awq-int4-gs128-asym.gguf"
load_tensor = load_awq_tensor

In [None]:
# Checkpoint option: auto_awq int4, group size 128, symmetric; dequantized with load_awq_tensor.
url = "https://huggingface.co/fbaldassarri/meta-llama_Llama-3.2-1B-Instruct-auto_awq-int4-gs128-sym/resolve/main/model.safetensors"
model_path = "models/Llama-3.2-1B-Instruct-auto_awq-int4-gs128-sym.safetensors"
output_gguf = "Llama-3.2-1B-Instruct-auto_awq-int4-gs128-sym.gguf"
load_tensor = load_awq_tensor

In [None]:
# Checkpoint option: ModelCloud gptqmodel 4-bit checkpoint, saved locally under a
# "GPTQ-g32" name; dequantized with load_gptq_tensor.
url = "https://huggingface.co/ModelCloud/Llama-3.2-1B-Instruct-gptqmodel-4bit-vortex-v2.5/resolve/main/model.safetensors"
model_path = "models/Llama-3.2-1B-Instruct-GPTQ-g32.safetensors"
output_gguf = "Llama-3.2-1B-Instruct-GPTQ-g32.gguf"
load_tensor = load_gptq_tensor

In [None]:
# Checkpoint option: shuyuej GPTQ checkpoint, saved locally under a "GPTQ-g128" name;
# dequantized with load_gptq_tensor.
url = "https://huggingface.co/shuyuej/Llama-3.2-1B-Instruct-GPTQ/resolve/main/model.safetensors"
model_path = "models/Llama-3.2-1B-Instruct-GPTQ-g128.safetensors"
output_gguf = "Llama-3.2-1B-Instruct-GPTQ-g128.gguf"
load_tensor = load_gptq_tensor

In [None]:
# Checkpoint option: Almheiri GPTQ-INT4 checkpoint, dequantized with load_gptq_tensor.
# On "Run All" this is the selection that takes effect, being the last one.
url = "https://huggingface.co/Almheiri/Llama-3.2-1B-Instruct-GPTQ-INT4/resolve/main/model.safetensors"
model_path = "models/Llama-3.2-1B-Instruct-GPTQ-INT4.safetensors"
output_gguf = "Llama-3.2-1B-Instruct-GPTQ-INT4.gguf"
load_tensor = load_gptq_tensor

In [None]:
# Download the selected safetensors checkpoint on first use, then open it lazily.
if not os.path.exists(model_path):
    print("file not found, download from internet...")
    # Make sure the target directory exists before wget tries to write into it.
    os.makedirs(os.path.dirname(model_path) or ".", exist_ok=True)
    try:
        subprocess.run(["wget", "-O", model_path, url], check=True)
    except subprocess.CalledProcessError:
        # `wget -O` creates the output file even when the download fails; remove
        # the stub so the next run retries instead of opening a broken file.
        if os.path.exists(model_path):
            os.remove(model_path)
        raise

reader = safe_open(model_path, framework="pt")
model_path

In [None]:
# Spot-check: print row `t` of the token embedding from the GGUF file and from
# the currently opened safetensors checkpoint so they can be compared by eye.
# The original cell referenced `awq_reader` / `gptq_reader`, which no cell in
# this notebook defines; the checkpoint opened above is bound to `reader`.
t = 10
print(gguf_reader.get_tensor(tensor_idx['token_embd.weight']).data[t])
print(reader.get_tensor('model.embed_tokens.weight')[t])
# Not every checkpoint stores a separate output head, so only print it when present.
if 'lm_head.weight' in reader.keys():
    print(reader.get_tensor('lm_head.weight')[t])

### Running demo

In [None]:
# Environment for Hugging Face Hub downloads: use a mirror endpoint and set the
# hf_transfer flag.
%env HF_ENDPOINT=https://hf-mirror.com
%env HF_HUB_ENABLE_HF_TRANSFER=1
# Optional: packages for running the quantized checkpoints with transformers.
# !pip install -U transformers peft accelerate optimum auto-gptq autoawq
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
# Prompt for the generation cell. The trailing notes list the quantization types
# for which, per the author, the answer "will be failed".
# The two commented-out alternatives ask "Who is the author of Naruto?" in Chinese
# (the second keeps the title in English).
# question = "火影忍者的作者是谁？" # Q5_K_S/Q4_K_S/IQ4_XS/Q3_K_XL will be failed
# question = "Naruto的作者是谁？" # Q5/IQ4_XS/Q3_K_XL will be failed
question = "Who is the author of 'Chainsaw Man'?" # Q4_0/IQ3_M/Q3_K_XL will be failed

In [None]:
# Hugging Face repo id of the quantized model to run in the demo below.
# NOTE: this rebinds `model_path`, which earlier held a local .safetensors file path;
# re-running the download/convert cells after this one would pick up the repo id instead.
# model_path = "AMead10/Llama-3.2-1B-Instruct-AWQ"
# model_path = "Almheiri/Llama-3.2-1B-Instruct-GPTQ-INT4"
model_path = "ModelCloud/Llama-3.2-1B-Instruct-gptqmodel-4bit-vortex-v2.5"

In [None]:
# Load the selected quantized model and greedily generate an answer to `question`.
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype="auto", device_map="cuda")

prompt = [
    {"role": "system", "content": "\n\nYou are a helpful assistant"},
    {"role": "user", "content": question},
]

# Render the chat messages with the model's template and tokenize them.
input_tensor = tokenizer.apply_chat_template(prompt, add_generation_prompt=True, return_tensors="pt")

# do_sample=False gives deterministic (greedy) decoding.
outputs = model.generate(input_ids=input_tensor.to(model.device), max_new_tokens=512, do_sample=False)
# To decode only the newly generated part, slice off the prompt tokens:
# result = tokenizer.decode(outputs[0][input_tensor.shape[1]:], skip_special_tokens=True)
result = tokenizer.decode(outputs[0], skip_special_tokens=True)
# Raw token ids first, then the decoded text (previously computed but never shown).
print(outputs[0])
print(result)

## Write a new GGUF file

In [None]:
from tqdm import tqdm
from typing import Any, Sequence, NamedTuple
from gguf import GGUFWriter
from gguf.constants import GGMLQuantizationType

class MetadataDetails(NamedTuple):
    """One GGUF metadata entry: its value type, decoded value and an optional description."""
    # GGUF value type of the entry
    type: gguf.GGUFValueType
    # decoded Python value of the entry
    value: Any
    # free-form description; empty by default
    description: str = ''

def get_field_data(reader: gguf.GGUFReader, key: str) -> Any:
    """Return the decoded value of metadata field ``key``, or ``None`` if absent.

    Uses ``ReaderField.contents()`` when the installed gguf package provides it.
    Some releases do not (the author hit ``AttributeError: 'ReaderField' object
    has no attribute 'contents'`` on remote Colab), so the field is decoded by
    hand in that case.

    Args:
        reader: an open GGUF reader.
        key: metadata key to look up.

    Returns:
        A ``str`` for string fields, a ``list`` for array fields, a scalar for
        other fields, or ``None`` when the field is missing or carries no type
        information.
    """
    field = reader.get_field(key)
    if not field:
        return None
    if hasattr(field, 'contents'):
        return field.contents()

    # Fallback for gguf releases without ReaderField.contents().
    if not field.types:
        return None
    main_type = field.types[0]
    if main_type == gguf.GGUFValueType.ARRAY:
        # For arrays the last entry of `types` is the element type.
        if field.types[-1] == gguf.GGUFValueType.STRING:
            return [str(bytes(field.parts[idx]), encoding='utf-8') for idx in field.data]
        return [value for idx in field.data for value in field.parts[idx].tolist()]
    if main_type == gguf.GGUFValueType.STRING:
        return str(bytes(field.parts[-1]), encoding='utf-8')
    return field.parts[-1].tolist()[0]

In [None]:
# Write a new GGUF file: metadata is copied from `gguf_reader`, the per-block
# projection weights are replaced by F16 tensors dequantized from `reader`, the
# per-block norm weights are taken from `reader` as F32, and every other tensor
# is copied unchanged from the source GGUF.
arch = get_field_data(gguf_reader, gguf.Keys.General.ARCHITECTURE)
alignment = get_field_data(gguf_reader, gguf.Keys.General.ALIGNMENT)
# Check before the writer is created so a failure leaves no open writer behind.
assert alignment is None, "GGUF files with a custom alignment are not handled here"

writer = gguf.GGUFWriter(output_gguf, arch=arch, endianess=gguf_reader.endianess)
try:
    for field in gguf_reader.fields.values():
        # Suppress virtual fields and fields written by GGUFWriter
        if field.name == gguf.Keys.General.ARCHITECTURE or field.name.startswith('GGUF.'):
            continue
        # get_field_data() is the notebook's single entry point for decoding a field.
        val = MetadataDetails(field.types[0], get_field_data(gguf_reader, field.name))
        if val.value is not None:
            writer.add_key_value(field.name, val.value, val.type)

    total_bytes = 0

    # Pass 1: declare every tensor (name, shape, dtype, size) in the original order.
    # NOTE: the replaced weights are dequantized here only to learn their shape and
    # are dequantized again in pass 2; this doubles the compute but avoids holding
    # all dequantized weights in memory at once.
    for tensor in gguf_reader.tensors:
        if tensor.name.startswith('blk') and '_norm' not in tensor.name:
            data = load_tensor(reader, tensor.name)
            dim1, dim2 = data.shape
            nbytes = dim1 * dim2 * 2  # two bytes per F16 element
            writer.add_tensor_info(tensor.name, (dim1, dim2), np.dtype(np.float16), nbytes, GGMLQuantizationType.F16)
            total_bytes += nbytes
        else:
            total_bytes += tensor.n_bytes
            writer.add_tensor_info(tensor.name, tensor.data.shape, tensor.data.dtype, tensor.data.nbytes, tensor.tensor_type)

    with tqdm(desc="Writing", total=total_bytes, unit="byte", unit_scale=True) as bar:
        writer.write_header_to_file()
        writer.write_kv_data_to_file()
        writer.write_ti_data_to_file()

        # Pass 2: write the tensor payloads in the same order as declared above.
        for tensor in gguf_reader.tensors:
            if tensor.name.startswith('blk'):
                data = load_tensor(reader, tensor.name)
                is_norm = '_norm' in tensor.name
                data_type = torch.float32 if is_norm else torch.float16
                writer.write_tensor_data(data.to(data_type).numpy())
                if is_norm:
                    bar.update(tensor.n_bytes)
                else:
                    bar.update(data.shape[0] * data.shape[1] * 2)
            else:
                writer.write_tensor_data(tensor.data)
                bar.update(tensor.n_bytes)
finally:
    # Always release the output file handle, even when conversion fails midway.
    writer.close()