In [None]:
# Raise the llama.cpp log verbosity to get more detailed output when needed.
%set_env LLAMA_LOG_VERBOSITY=4

In [None]:
# Enable LLAMA_SET_ROWS to reduce graph-rebuild time and thereby improve performance.
# FIXME: something goes wrong when running perplexity (presumably with this setting enabled — confirm).
%set_env LLAMA_SET_ROWS=1

In [None]:
# Expose only GPU 0 to CUDA for everything launched from this notebook.
%set_env CUDA_VISIBLE_DEVICES=0

In [None]:
# NOTE(review): presumably turns off CUDA graph usage in the ggml CUDA backend — confirm.
%set_env GGML_CUDA_DISABLE_GRAPHS=1

# Quick build

In [None]:
# Build a CPU-only Debug configuration of llama.cpp into llama.cpp/cpu_debug
# (CUDA, RPC, BLAS and CURL support are switched off).
%cd llama.cpp
!cmake -B cpu_debug -DGGML_CUDA=OFF -DGGML_RPC=OFF -DGGML_BLAS=OFF \
    -DGGML_SCHED_MAX_COPIES=1 -DLLAMA_CURL=OFF -DCMAKE_BUILD_TYPE=Debug
!cmake --build cpu_debug --config Debug -j $(nproc)
%cd ..

In [None]:
# Build a CUDA-enabled Debug configuration of llama.cpp into llama.cpp/cuda_debug,
# compiled for CUDA architectures 80, 86 and 89 (RPC, BLAS and CURL are switched off).
%cd llama.cpp
!cmake -B cuda_debug -DGGML_CUDA=ON -DGGML_RPC=OFF -DGGML_BLAS=OFF -DCMAKE_CUDA_ARCHITECTURES="80;86;89" \
    -DGGML_SCHED_MAX_COPIES=1 -DLLAMA_CURL=OFF -DCMAKE_BUILD_TYPE=Debug
!cmake --build cuda_debug --config Debug -j $(nproc)
%cd ..

In [None]:
# Build the CPU-only llama.cpp binaries used for actual runs into llama.cpp/cpu_build
# (built with --config Release; CUDA, RPC, BLAS and CURL are switched off).
%cd llama.cpp
!cmake -B cpu_build -DGGML_CUDA=OFF -DGGML_RPC=OFF -DGGML_BLAS=OFF \
    -DGGML_SCHED_MAX_COPIES=1 -DLLAMA_CURL=OFF
!cmake --build cpu_build --config Release -j $(nproc)
%cd ..

In [None]:
# Build the CUDA-enabled llama.cpp binaries used for actual runs into llama.cpp/cuda_build
# (built with --config Release for CUDA architectures 80, 86 and 89).
%cd llama.cpp
!cmake -B cuda_build -DGGML_CUDA=ON -DGGML_RPC=OFF -DGGML_BLAS=OFF -DCMAKE_CUDA_ARCHITECTURES="80;86;89" \
    -DGGML_SCHED_MAX_COPIES=1 -DLLAMA_CURL=OFF
!cmake --build cuda_build --config Release -j $(nproc)
%cd ..

In [None]:
# Shallow-clone ik_llama.cpp and build it with CUDA enabled into ik_llama.cpp/cuda_build.
!git clone https://github.com/ikawrakow/ik_llama.cpp --depth 1
%cd ik_llama.cpp
!cmake -B cuda_build -DGGML_CUDA=ON -DGGML_RPC=OFF -DGGML_BLAS=OFF -DGGML_SCHED_MAX_COPIES=1 -DLLAMA_CURL=OFF
!cmake --build cuda_build --config Release -j $(nproc)
%cd ..

# 42 test

- [TinyStories-656K](https://hf-mirror.com/mradermacher/TinyStories-656K-GGUF)

<details>
<summary>config</summary>

```json
{
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 128,
  "initializer_range": 0.02,
  "intermediate_size": 384,
  "max_position_embeddings": 512,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 8,
  "num_hidden_layers": 2,
  "num_key_value_heads": 4,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": true,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.41.2",
  "use_cache": true,
  "vocab_size": 2048
}
```
</details>

In [None]:
model = "models/TinyStories-656K.f16.gguf"
assert os.path.exists(model)
!llama.cpp/cpu_build/bin/llama-cli -m {model} -p "the answer to the ultimate question of life, the universe, and everything is 42. One day, they " \
-t 1 -n 2 --temp 0 --top-k 0 --top-p 1.0 --min-p 0.0 2>/dev/null

# Hello world test

- [SmolLM2-135M](https://hf-mirror.com/bartowski/SmolLM2-135M-Instruct-GGUF/tree/main)

<details>
<summary>Prompt format</summary>


```
<|im_start|>system
{system_prompt}<|im_end|>
<|im_start|>user
{prompt}<|im_end|>
<|im_start|>assistant
```

</details>

In [None]:
# `os` is otherwise only imported in the Quantization section further down,
# so import it here to keep this cell runnable on a fresh kernel.
import os

# ChatML-style wrapper around the user question (see the prompt format above).
prefix = "<|im_start|>system\nYou are a helpful assistant<|im_end|>\n<|im_start|>user\n"
suffix = "<|im_end|>\n<|im_start|>assistant\n"
# Q4_K_L == Q4_K_M, Q5_K_L == Q5_K_M
model = "models/SmolLM2-135M-Instruct-f16.gguf"
assert os.path.exists(model), f"missing model file: {model}"

In [None]:
# Ask for a minimal C "hello world" program with temperature 0 on a single CPU thread.
question = "Give the most simple C program to print 'hello world!'" # fails for Q4_K_S and lower quantizations
prompt = prefix + question + suffix
!llama.cpp/cpu_build/bin/llama-cli -m {model} -p "{prompt}" -no-cnv -t 1 --temp 0 --top-k 0 --top-p 1.0 --min-p 0.0 2>/dev/null

In [None]:
# Variant of the hello-world check with a less common target string.
question = "Give the most simple C program to print 'Love world!'" # does not work for Q8_0/Q6_K; Q5_K_S is the lowest (presumably the lowest that passes — confirm); Q4 and below fail
# question = "Give the most simple C program to print 'love world!'" # Q4 and below fail
prompt = prefix + question + suffix
!llama.cpp/cpu_build/bin/llama-cli -m {model} -p "{prompt}" -no-cnv -t 1 --temp 0 --top-k 0 --top-p 1.0 --min-p 0.0 2>/dev/null

# Manga author test

- [Llama-3.2-1B](https://hf-mirror.com/bartowski/Llama-3.2-1B-Instruct-GGUF/tree/main)
- [Official text_prompt_format](https://github.com/meta-llama/llama-models/blob/main/models/llama3_2/text_prompt_format.md)

<details>
<summary>Prompt format</summary>

```
<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

{system_prompt}<|eot_id|><|start_header_id|>user<|end_header_id|>

{prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
```


```python
# Q4_K_M/S 展现出了随着prefix和suffix微小变化的不稳定性
# prefix = "<|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 Jul 2024\n\nYou are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n"
# prefix = "<|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 Dec 2024\n\nYou are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n"
# prefix = "<|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 Oct 2024\n\nYou are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n"
# suffix = "<|eot_id|><|start_header_id|>assistant<|end_header_id|>" # Q4_K_M is only correct for 火影忍者的作者是谁？ when paired with Jul 2024
```

</details>

In [None]:
# Fails with Q4_0 and AWQ quantization.
# `answer` is the reference output printed next to the model output by the runner cell.
question = "Who is the author of 'Demon Slayer'?"
answer = "The author of the popular manga and anime series 'Demon Slayer: Kimetsu no Yaiba' is Koyoharu Gotoge. [end of text]"

In [None]:
# Fails with AWQ quantization.
question = "Who is the author of 'Chainsaw Man'?"
answer = "The author of the manga and anime series 'Chainsaw Man' is Tatsuki Fujimoto. [end of text]"

In [None]:
# Fails with [awq].
question = "Who is the author of 'Detective Conan'?"
answer = "The author of the popular manga and anime series 'Detective Conan' is Gosho Aoyama. [end of text]"

In [None]:
# Fails with quantizations below 4 bits.
question = "Who is the author of Manga 'Slam Dunk'?"
answer = 'The author of the popular manga series "Slam Dunk" is Takehiko Inoue. [end of text]'

In [None]:
# Fails with quantizations below 4 bits.
question = "Who is the author of 'Berserk'?"
answer = 'The author of the manga and anime series "Berserk" is Kentaro Miura. [end of text]'

In [None]:
# Fails with **Q5_K_M/Q5_K_S**, [Q4_K_M/Q4_K_S] and IQ4_XS.
question = "Naruto的作者是谁？"
answer = "Naruto的作者是Masashi Kishimoto [end of text]"

In [None]:
# Fails with [Q4_K_M/Q4_K_S], IQ4_XS, AWQ and GPTQ.
question = "火影忍者的作者是谁？"
answer = "火影忍者是由Masashi Kishimoto所创作的日本动画和漫画。 [end of text]"

In [None]:
if "Qwen3" in model_name:
    prefix = "<|im_start|>user\n"
    suffix = " /no_think<|im_end|>\n<|im_start|>assistant"
    # suffix = " /think<|im_end|>\n<|im_start|>assistant"
else: # Llama-3
    prefix = "<|start_header_id|>user<|end_header_id|>\n\n"
    suffix = "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
prompt = prefix + question + suffix
!llama.cpp/cpu_build/bin/llama-cli -m {model} -p "{prompt}" -c 2048 -n 512 --temp 0 --top-k 1 --seed 42 -ngl 100 -no-cnv 2>/dev/null
print("\n"+answer)
model

# Quantization

In [None]:
import os
# Select the model to work on: leave exactly one `model_name` assignment un-commented.
# model_name = "Llama-3.2-1B-Instruct"
# model_name = "Llama-3.1-8B-Instruct"
# model_name = "Llama-3.3-70B-Instruct"
# model_name = "Llama-3_3-Nemotron-Super-49B-v1_5"
# model_name = "DeepSeek-R1-Distill-Llama-70B"
# model_name = "DeepSeek-R1-Distill-Qwen-14B"
# model_name = "DeepSeek-R1-Distill-Qwen-32B"
# model_name = "Qwen3-0.6B"
# model_name = "Qwen3-8B"
# model_name = "Qwen3-14B"
model_name = "Qwen3-32B"
# model_name = "gemma-3-12b-it"
# model_name = "gemma-3-27b-it"
# model_name = "phi-4"
# model_name = "Phi-4-reasoning-plus"
# model_name = "Mistral-Small-3.2-24B-Instruct-2506"
# model_name = "Magistral-Small-2509"

# Directory holding the original Hugging Face checkpoint; it must already exist.
hf_model = f"models/{model_name}"
assert os.path.exists(hf_model)
# GGUF file written by convert_hf_to_gguf.py in the next cell.
model = f"models/{model_name}.gguf"
# BF16 GGUF written by llama-quantize in the next cell with `quant_option` applied.
bf16_model = f"models/{model_name}-BF16.gguf"
# Extra llama-quantize flags: store the token embeddings and the output tensor as q6_K.
quant_option = "--token-embedding-type q6_K --output-tensor-type q6_K"

model_name

In [None]:
# Convert the Hugging Face checkpoint into a BF16 GGUF file (`model`).
!python llama.cpp/convert_hf_to_gguf.py --outtype bf16 {hf_model} --outfile {model}
# Re-write it as type bf16 with `quant_option` applied, producing `bf16_model`.
!ik_llama.cpp/cuda_build/bin/llama-quantize {quant_option} {model} {bf16_model} bf16

## gguf

<details>
<summary>imatrix modification for dumping activations</summary>

```diff
diff --git a/tools/imatrix/imatrix.cpp b/tools/imatrix/imatrix.cpp
index cf07d99d..21678d82 100644
--- a/tools/imatrix/imatrix.cpp
+++ b/tools/imatrix/imatrix.cpp
@@ -37,9 +37,17 @@ static const char * const LLM_KV_IMATRIX_DATASETS    = "imatrix.datasets";
 static const char * const LLM_KV_IMATRIX_CHUNK_COUNT = "imatrix.chunk_count";
 static const char * const LLM_KV_IMATRIX_CHUNK_SIZE  = "imatrix.chunk_size";
 
+#define DUMP_ACTIVATION 0
+#define DUMP_THRESHOLDS 1
 struct Stats {
     std::vector<float>   values;
     std::vector<int64_t> counts;
+#if DUMP_ACTIVATION || DUMP_THRESHOLDS
+    std::vector<float> activations;
+#endif
+#if DUMP_THRESHOLDS
+    std::vector<float> top_k;
+#endif
 };
 
 struct tensor_statistics {
@@ -216,6 +224,26 @@ static void compute_cossim(std::vector<tensor_statistics> & tstats) {
     }
 }
 
+#if DUMP_THRESHOLDS
+static void bucket_process(const std::vector<float>& arr, std::vector<float>& top_k) {
+    const size_t CHUNK = 1024;
+    size_t N = arr.size();
+    size_t num_groups = N / CHUNK;
+
+    for (size_t g = 0; g < num_groups; g++) {
+        const float* base = arr.data() + g * CHUNK;
+
+        float tmp[CHUNK];
+        std::copy(base, base + CHUNK, tmp);
+
+        int pos_median = CHUNK/2;
+        std::nth_element(tmp, tmp + pos_median, tmp + CHUNK);
+        float median = tmp[pos_median];
+        top_k.push_back(median);
+    }
+}
+#endif
+
 bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) {
     GGML_UNUSED(user_data);
 
@@ -340,6 +368,12 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
         if (e.values.empty()) {
             e.values.resize(src1->ne[0] * n_mat, 0);
             e.counts.resize(n_mat, 0);
+#if DUMP_ACTIVATION
+            e.activations.resize(src1->ne[0]*src1->ne[1]);
+#elif DUMP_THRESHOLDS
+            e.activations.resize(src1->ne[0]);
+            e.top_k.clear();
+#endif
         }
         else if (e.values.size() != (size_t)(src1->ne[0] * n_mat)) {
             LOG_ERR("%s: inconsistent size for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.values.size(), (int)(src1->ne[0] * n_mat));
@@ -349,6 +383,9 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
             LOG_ERR("%s: inconsistent expert count for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.counts.size(), (int)n_mat);
             exit(1); //GGML_ABORT("fatal error");
         }
+#if DUMP_THRESHOLDS
+        e.activations.assign(e.activations.size(), 0.0f);
+#endif
         LOG_DBGV(2, "%s[%d]: %32s, %s, %5d x %5d x %5d, %d\n", __func__, m_last_chunk, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->ne[2], (int)src1->type);
         for (int64_t i3 = 0; i3 < src1->ne[3]; ++i3) {
             for (int64_t i2 = 0; i2 < src1->ne[2]; ++i2) {
@@ -358,13 +395,29 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
                 for (int64_t row = 0; row < src1->ne[1]; ++row) {
                     const float * x = (const float *) (data + row * src1->nb[1] + i2 * src1->nb[2] + i3 * src1->ne[3]);
                     e.counts[mat_id]++;
+#if DUMP_ACTIVATION
+                    auto activations = e.activations.data() + i11*src1->ne[0];
+#elif DUMP_THRESHOLDS
+                    auto activations = e.activations.data();
+#endif
                     for (int64_t j = 0; j < src1->ne[0]; ++j) {
                         e.values[mat_start + j] += x[j] * x[j];
+#if DUMP_ACTIVATION
+                        activations[j] = x[j]*x[j];
+#elif DUMP_THRESHOLDS
+                        activations[j] += x[j]*x[j];
+#endif
                         if (!std::isfinite((float)e.values[j])) {
                             LOG_ERR("%f detected in %s\n", (float)e.values[j], wname.c_str());
                             exit(1);
                         }
                     }
+#if DUMP_THRESHOLDS
+                    if (row%5 == 4) {
+                        bucket_process(e.activations, e.top_k);
+                        e.activations.assign(e.activations.size(), 0.0f);
+                    }
+#endif
                 }
                 const int32_t n_chunk = e.counts[mat_id] / chunk_size;
                 if (n_chunk > m_last_chunk) {
@@ -466,6 +519,21 @@ void IMatrixCollector::save_imatrix_legacy(int32_t ncall) const {
                 tmp[i] = (value / count) * static_cast<float>(ncall);
             }
             out.write((const char *) tmp.data(), nval * sizeof(float));
+#if DUMP_ACTIVATION
+            int nact = stat.activations.size();
+            out.write((const char *) &nact, sizeof(nact));
+            out.write((const char*)stat.activations.data(), nact*sizeof(float));
+#elif DUMP_THRESHOLDS
+            size_t index0 = std::ceil(0.1 * stat.top_k.size());
+            std::vector<float> temp0 = stat.top_k;
+            std::nth_element(temp0.begin(), temp0.begin() + index0, temp0.end());
+            out.write((const char *)&temp0[index0], sizeof(float));
+
+            size_t index1 = std::ceil(0.9 * stat.top_k.size());
+            std::vector<float> temp1 = stat.top_k;
+            std::nth_element(temp1.begin(), temp1.begin() + index1, temp1.end());
+            out.write((const char *)&temp1[index1], sizeof(float));
+#endif
         }
     }
 

```

</details>

In [None]:
# ~~Get the imatrix file~~
# now, we used llama-imatrix to get median info of each layers about 20 minutes for Qwen3-32B BF16
calib_file = f"calibration_data/calibration_datav3.txt"
imatrix_file = f"models/{model_name}.med"
# to use cuda backend, otherwise will be very very slow
!llama.cpp/cuda_build/bin/llama-imatrix -m {bf16_model} -f {calib_file} --output-file {imatrix_file} -ngl 100

In [None]:
# Select the quantization type: leave exactly one `quant_type` assignment un-commented.
# ---------------------------
# quant_type = "exl3-2.0bpw"
# quant_type = "exl3-2.25bpw"
# quant_type = "IQ2_KT"
quant_type = "IQ2_K"
# quant_type = "IQ2_M"

#---------------------------
# quant_type = "IQ3_XXS"
# quant_type = "IQ3_KT"
# quant_type = "exl3-3.0bpw"

#---------------------------
# quant_type = "Q4_0"
# quant_type = "IQ4_XS"
# quant_type = "Q4_K_M"

#---------------------------
# Optional suffixes appended to the chosen type (NOTE(review): presumably select derived model files — confirm).
# quant_type += "-IQ2_KS"
# quant_type += "-dequant"

# Path of the GGUF file for the selected model and quantization type.
model = f"models/{model_name}-{quant_type}.gguf"
model

In [None]:
# Get the quant model
# quant_option += " --pure"
# use cpu_build is also ok because only cpu is needed
assert os.path.exists(bf16_model)
# we download unsloth provided quanted model instead
assert quant_type not in ["IQ2_XXS", "IQ2_M"]

# imatrix_file = ""
imatrix_file = f"--imatrix models/imatrix_unsloth-{model_name}.dat"
assert os.path.exists(f"models/imatrix_unsloth-{model_name}.dat")
# if quant old type, llama.cpp is prefered, instead of ik_
# !llama.cpp/cuda_build/bin/llama-quantize {quant_option} {imatrix_file} {bf16_model} {model} {quant_type}
!ik_llama.cpp/cuda_build/bin/llama-quantize {quant_option} {imatrix_file} {bf16_model} {model} {quant_type}
model

In [None]:
# Inspect the quantized model with gguf_dump.py and print the result as markdown.
# The copy used here lives in llama.cpp/gguf-py/gguf/scripts.
assert os.path.exists(model)
gguf_dump_tool = "llama.cpp/gguf-py/gguf/scripts/gguf_dump.py"
!python {gguf_dump_tool} --markdown {model}

<details>
<summary>perplexity results</summary>

```bash
wc -c < models/Llama-3.1-8B-Instruct-IQ2_K.gguf | awk '{printf "%.2f bpw\n", ($1/1024/1024-7.5-410.98*2)/(14134.97-410.98*2)*16}'
wc -c < models/Qwen3-8B-IQ2_K.gguf | awk '{printf "%.2f bpw\n", ($1/1024/1024-7.5-486.86*2)/(14222.89-486.86*2)*16}'
wc -c < models/gemma-3-12b-it-IQ2_K.gguf | awk '{printf "%.2f bpw\n", ($1/1024/1024-7.5-787.69)/(21310.61-787.69)*16}'

wc -c < models/Llama-3_3-Nemotron-Super-49B-v1_5-IQ2_K.gguf | awk '{printf "%.2f bpw\n", ($1/1024/1024-7.5-821.953*2)/(92751.97-821.953*2)*16}'
wc -c < models/Llama-3.3-70B-Instruct-IQ2_K.gguf | awk '{printf "%.2f bpw\n", ($1/1024/1024-7.5-821.95*2)/(132208.94-821.95*2)*16}'
wc -c < models/Qwen3-14B-IQ2_K.gguf | awk '{printf "%.2f bpw\n", ($1/1024/1024-7.5-608.57*2)/(26418.76-608.57*2)*16}'
wc -c < models/Qwen3-32B-IQ2_K.gguf | awk '{printf "%.2f bpw\n", ($1/1024/1024-7.5-608.57*2)/(60739.72-608.57*2)*16}'
wc -c < models/gemma-3-27b-it-IQ2_K.gguf | awk '{printf "%.2f bpw\n", ($1/1024/1024-7.5-1102.77)/(49932.94-1102.77)*16}'
wc -c < models/Phi-4-reasoning-plus-IQ2_K.gguf | awk '{printf "%.2f bpw\n", ($1/1024/1024-7.5-401.95*2)/(26805.49-401.95*2)*16}'
wc -c < models/phi-4-IQ2_K.gguf | awk '{printf "%.2f bpw\n", ($1/1024/1024-7.5-401.95*2)/(26805.49-401.95*2)*16}'
wc -c < models/Mistral-Small-3.2-24B-Instruct-2506-IQ2_K.gguf | awk '{printf "%.2f bpw\n", ($1/1024/1024-7.5-525.00*2)/(43451.58-525.00*2)*16}'
wc -c < models/Magistral-Small-2509-IQ2_K.gguf | awk '{printf "%.2f bpw\n", ($1/1024/1024-7.5-525.00*2)/(43451.58-525.00*2)*16}'
wc -c < models/GLM-Z1-32B-0414-IQ2_K.gguf | awk '{printf "%.2f bpw\n", ($1/1024/1024-7.5-728.438*2)/(60022.62-728.438*2)*16}'
```

</details>

quant     | Lm70B  |DsLm70B| bpw| Lm49B  | bpw| Qw32B  | bpw| Gm27B  | bpw| Mis24B | Mag24B | bpw| Qw14B  | bpw| Phi14BR| bpw| phi14B | bpw
---       | ---    |---    | ---| ---    | ---| ---    | ---| ---    | ---| ---    | ---    | ---| ---    | ---| ---    | ---| ---    | ---
BF16      | 3.9195 | 6.4641| 16 | 8.9911 | 16 | 7.8999 | 16 | 8.4254 | 16 | 5.3768 | 5.5103 | 16 | 9.0137 | 16 | 7.2447 | 16 | 6.5365 | 16 
|         |**E6H6**|       |    |**E6H6**|    |**E6H6**|    |**E6H6**|    |**E6H6**|        |    |**E6H6**|    |**E6H6**|    |**E6H6**|    
IQ2_M     |        |       |    |        |    |        |    |        |    |        |        |    | 10.0184|2.75|  9.2491|2.70| 7.9992 |2.66
IQ2_K     |        |       |    | 11.7787|2.45|  9.3806|2.49|  9.7112|2.49| 11.9114| 10.5590|2.44| 10.6293|2.49|  9.3127|2.46| 8.0229 |2.46
IQ2_KT    | 6.6194 | 8.6518|2.30| 12.4296|2.29|  9.7530|2.31| 10.1012|2.28| 14.1785| 12.9880|2.25| 10.9093|2.32|  9.0545|2.42| 7.8496 |2.42
exl3-2.25 | 6.1918 | 8.3365|2.25|  9.3429|2.25|  8.9215|2.25|  9.6186|2.25|  6.6008|  6.6564|2.25|        |    |        |    |        |    
exl3-2.0  | 6.6928 | 8.9645|2.00|  9.8139|2.00|  9.2934|2.00| 10.8345|2.00|  6.8854|  7.0594|2.00|        |    |        |    |        |    
|         |**E6H6**|       |    |**E6H6**|    |**E6H6**|    |**E6H6**|    |**E6H6**|        |    |**E6H6**|    |**E6H6**|    |**E6H6**|    
Q4_K_M    |*4.0955*| 6.5746|4.79|*8.1214*|4.81| 7.9957 |4.79|*8.4811*|4.81|*5.4899*| 5.6145 |4.82| 9.2685 |4.79|*7.3020*|4.78| 6.6033 |4.89
Q4_0      | 4.2788 | 6.6847|4.52|  8.3147|4.52| 8.0018 |4.52| 8.5661 |4.52| 5.5677 | 5.6938 |4.52| 9.1585 |4.52| 7.4037 |4.52| 6.6366 |4.51
IQ4_XS    | 4.1613 | 6.7978|4.26|  8.1394|4.22| 7.9858 |4.26| 8.4654 |4.25| 5.5227 | 5.6457 |4.26| 9.1024 |4.27| 7.3601 |4.25| 6.6109 |4.25
|           |**E6H6**|        | |        |    |**E6H6**|    |**E6H6**|    |**E6H6**|        |    |**E6H6**|    |**E6H6**|    |**E6H6**|     
IQ2_M_KS    |        |        | |        |    |        |    |        |    |        |        |    | 9.0882 |    | 7.3299 |    | 6.6271 |    
IQ2_K_KS    |        |        | |*8.2164*|    | 7.9724 |    | 8.4961 |    | 5.5148 | 5.6314 |    | 9.1922 |    | 7.3707 |    | 6.6193 |    
IQ2_KT_KS   | 4.2772 | 6.7511 | | 12.0403|    | 8.0623 |    | 8.5156 |    | 5.5677 | 5.6964 |    | 9.2404 |    | 7.3630 |    | 6.6411 |
exl3-2.25_KS| 4.1763 | 6.6362 | | 13.5946|    | 8.0491 |    | 8.5495 |    | 5.5165 | 5.6529 |    |        |    |        |    |        |    
exl3-2.0_KS | 4.2786 | 6.8787 | | 12.5879|    | 8.0846 |    | 8.6236 |    | 5.5777 | 5.6954 |    |        |    |        |    |        |    

> Officially released quantized models are used first; otherwise the quant is self-generated with the unsloth imatrix `.dat` file.

In [None]:
# Measure perplexity on the wikitext-2 raw test file; the setup is aligned with
# https://github.com/ikawrakow/ik_llama.cpp/discussions/63
assert os.path.exists(model)
# phi-4 needs flash attention, so the -fa flag is enabled.
!llama.cpp/cuda_build/bin/llama-perplexity -m {model} -f wikitext-2-raw/wiki.test.raw -t 1 -ngl 100 -fa #--chunks 8 -b 8 -c 8

In [None]:
# Benchmark with the CUDA build: 512-token prompt, 128 generated tokens, 1 thread, -ngl 100.
assert os.path.exists(model)
!llama.cpp/cuda_build/bin/llama-bench -m {model} -p 512 -n 128 -t 1 -ngl 100

In [None]:
# Benchmark with the CPU build: prompt sizes 4/8/32/128/512, 128 generated tokens, 8 threads.
assert os.path.exists(model)
!llama.cpp/cpu_build/bin/llama-bench -m {model} -p 4,8,32,128,512 -n 128 -t 8

## exllamav3

In [None]:
# !git clone https://github.com/turboderp-org/exllamav3 --depth 1
assert os.path.exists("exllamav3")
%cd exllamav3
!pip install -r requirements.txt
!pip install .
%cd ..

In [None]:
bpw = 2.25
quant_type = f"exl3-{str(bpw)}bpw"
model = f"models/{model_name}-{quant_type}"
# https://github.com/turboderp-org/exllamav3/blob/master/exllamav3/conversion/convert_model.py#L31
%cd exllamav3
!python convert.py -i {"../"+hf_model} -o {"../"+model} -w f"/tmp/{model_name}-{quant_type}" -b {str(bpw)} #-hb 4
%cd -

## safetensors

<details>
<summary>modification for llama and mistral series</summary>

```diff
diff --git a/exllamav3/util/rope.py b/exllamav3/util/rope.py
index 38cf670..0668269 100644
--- a/exllamav3/util/rope.py
+++ b/exllamav3/util/rope.py
@@ -307,6 +307,8 @@ class RoPE:
         norm_constant_bias: float = 0.0,
         inv_freq: torch.Tensor | None = None
     ):
+        q = q.reshape(*q.shape[:-1], -1, 2).transpose(-1, -2).flatten(-2)
+        k = k.reshape(*k.shape[:-1], -1, 2).transpose(-1, -2).flatten(-2)
         q = q.contiguous()
         if k is not None: k = k.contiguous()
         if positions is not None: positions = positions.contiguous()
```

</details>

In [None]:
from safetensors import safe_open
from safetensors.torch import save_file
from pathlib import Path
import json

# only llama and mistral series need to do this
def inverse_permute(w, name, num_heads, num_kv_heads):
    """Interleave the two halves of every attention head along dim 0 for q/k projections.

    Within each head the rows are viewed as (2, half, ...) and transposed to
    (half, 2, ...), so output row ``2*i + p`` of a head is input row ``p*half + i``.
    NOTE(review): presumably this undoes the rotary-dimension permutation applied
    when the checkpoint was converted to the Hugging Face layout — confirm.

    Args:
        w: tensor whose first dimension is ``n_heads * head_dim``. Both 2-D weights
            and 1-D biases are accepted; any trailing dimensions are left untouched.
        name: tensor name; only names containing ``q_proj`` or ``k_proj`` are permuted.
        num_heads: number of attention heads, used for ``q_proj`` tensors.
        num_kv_heads: number of key/value heads, used for ``k_proj`` tensors.

    Returns:
        The permuted tensor with the same shape as ``w``, or ``w`` itself when the
        name matches neither projection.
    """
    if 'q_proj' in name:
        n_head = num_heads
    elif 'k_proj' in name:
        n_head = num_kv_heads
    else:
        return w
    rows = w.shape[0]
    # Carry the trailing dimensions through so 1-D bias tensors work as well as 2-D weights.
    return (
        w.reshape(n_head, 2, rows // n_head // 2, *w.shape[1:])
        .transpose(1, 2)
        .reshape(w.shape)
    )

# Rewrite every *.safetensors shard of the Hugging Face checkpoint IN PLACE with the
# q_proj/k_proj tensors passed through inverse_permute.
with open(f"{hf_model}/config.json") as f:
    config = json.load(f)

# Some configs nest the text model's settings under "text_config"; otherwise use the top level.
text_config = config.get('text_config', config)
num_heads = text_config['num_attention_heads']
num_kv_heads = text_config['num_key_value_heads']

# Marker stored in the safetensors metadata of rewritten shards. The permutation is not
# idempotent, so re-running this cell on an already rewritten shard would scramble it.
PERMUTED_KEY = "rope_inverse_permuted"

safetensors_files = sorted(str(path) for path in Path(hf_model).glob("*.safetensors"))

for model_path in safetensors_files:
    with safe_open(model_path, framework="pt") as f:
        # metadata() returns None when the file carries no metadata.
        metadata = dict(f.metadata() or {})
        if metadata.get(PERMUTED_KEY) == "1":
            print(f"skip {model_path}: already permuted")
            continue
        tensors_dict = {
            name: inverse_permute(f.get_tensor(name), name, num_heads, num_kv_heads)
            for name in f.keys()
        }
    # Keep the original metadata (loaders commonly expect "format"), and add the marker.
    metadata.setdefault("format", "pt")
    metadata[PERMUTED_KEY] = "1"
    # The shard is closed at this point, so it can be overwritten safely.
    save_file(tensors_dict, model_path, metadata=metadata)

In [None]:
from safetensors import safe_open
model_path = f"{model}/model.safetensors"
# model_path = 'models/Qwen3-8B-exl3-2.5bpw/model.safetensors'

# Sum the storage size of all tensors whose name contains "model.layers" and show it in MiB.
with safe_open(model_path, framework="pt") as f:
    byte_cnt = 0
    for name in f.keys():
        if "model.layers" in name:
            tensor = f.get_tensor(name)
            # Use the dtype's real element width instead of assuming 2 bytes per element.
            byte_cnt += tensor.numel() * tensor.element_size()
byte_cnt/1024.0/1024.0

In [None]:
from safetensors import safe_open
model_path = f"{model}/model.safetensors"

# Print name, shape and dtype of every tensor in the file, one ruled-off record per tensor.
with safe_open(model_path, framework="pt") as f:
    for name in f.keys():
        tensor = f.get_tensor(name)
        print(
            f"name: {name}",
            f"shape: {tensor.shape}",
            f"type: {tensor.dtype}",
            "-" * 50,
            sep="\n",
        )

## residual/speculative

In [None]:
# Select the draft quantization: leave exactly one `draft_type` assignment un-commented.
# draft_type = "exl3-2.0bpw"
# draft_type = "exl3-2.25bpw"
# draft_type = "IQ2_KT"
draft_type = "IQ2_K"
# draft_type = "IQ2_M"

draft_model_prefix = f"models/{model_name}-{draft_type}"
# Residual weights for this draft type; this is the input of the quantization cell below.
model = draft_model_prefix + "-residual.gguf"
# IQ2_KS-quantized residual written by the quantization cell below.
residual_model = draft_model_prefix + "-residual-IQ2_KS.gguf"
model

In [None]:
# Given residual weight corresponding to draft_type to quant it
assert os.path.exists(model)
# imatrix_file = ""
imatrix_file = f"--imatrix models/imatrix_unsloth-{model_name}.dat"
assert os.path.exists(f"models/imatrix_unsloth-{model_name}.dat")
!ik_llama.cpp/cuda_build/bin/llama-quantize {imatrix_file} --pure {model} {residual_model} IQ2_KS
!rm {model}
residual_model

In [None]:
target_model = draft_model_prefix + "-IQ2_KS.gguf"
imatrix_file = draft_model_prefix + "-IQ2_KS.imatrix"

calib_file = f"calibration_data/calibration_datav3.txt"
# modified ik_llama.cpp imatrix to dump activations
# Attention: the corresponding load_gguf_imatrix func in playgguf.ipynb requires 32-bit int range,
# therefore, the dumped activations imatrix file should smaller than 2GB
# therefore, the batch size should be tried to fit the above constraints
!ik_llama.cpp/cuda_build/bin/llama-imatrix -m {target_model} -f {calib_file} --output-file {imatrix_file} -ngl 100 -b 256 -c 256 --chunks 32 #-t 1
target_model

<details>
<summary>Mistral-Small-3.2-24B-Instruct-2506 prompt example</summary>

```
[SYSTEM_PROMPT]You are Mistral-Small-3.2-24B-Instruct-2506, a Large Language Model (LLM) created by Mistral AI, a French startup headquartered in Paris.
You power an AI assistant called Le Chat.
Your knowledge base was last updated on 2023-10-01.
The current date is 2025-10-01.

When you're not sure about some information or when the user's request requires up-to-date or specific data, you must use the available tools to fetch the information. Do not hesitate to use tools whenever they can provide a more accurate or complete response. If no relevant tools are available, then clearly state that you don't have the information and avoid making up anything.
If the user's question is not clear, ambiguous, or does not provide enough context for you to accurately answer the question, you do not try to answer it right away and you rather ask the user to clarify their request (e.g. "What are some good restaurants around me?" => "Where are you?" or "When is the next flight to Tokyo" => "Where do you travel from?").
You are always very attentive to dates, in particular you try to resolve dates (e.g. "yesterday" is 2023-09-30) and when asked about information at specific dates, you discard information that is at another date.
You follow these instructions in all languages, and always respond to the user in the language they use or request.
Next sections describe the capabilities that you have.

# WEB BROWSING INSTRUCTIONS

You cannot perform any web search or access internet to open URLs, links etc. If it seems like the user is expecting you to do so, you clarify the situation and ask the user to copy paste the text directly in the chat.

# MULTI-MODAL INSTRUCTIONS

You have the ability to read images, but you cannot generate images. You also cannot transcribe audio files or videos.
You cannot read nor transcribe audio files or videos.

# TOOL CALLING INSTRUCTIONS

You may have access to tools that you can use to fetch information or perform actions. You must use these tools in the following situations:

1. When the request requires up-to-date information.
2. When the request requires specific data that you do not have in your knowledge base.
3. When the request involves actions that you cannot perform without tools.

Always prioritize using tools to provide the most accurate and helpful response. If tools are not available, inform the user that you cannot perform the requested action at the moment.[/SYSTEM_PROMPT][INST]Write 20 sentences about summer.[/INST]

```
    
</details>

<details>
<summary>Magistral-Small-2509  prompt example</summary>

```
[SYSTEM_PROMPT]First draft your thinking process (inner monologue) until you arrive at a response. Format your response using Markdown, and use LaTeX for any mathematical equations. Write both your thoughts and the response in the same language as the input.

Your thinking process must follow the template below:[THINK]Your thoughts or/and draft, like working through an exercise on scratch paper. Be as casual and as long as you want until you are confident to generate the response. Use the same language as the input.[/THINK]Here, provide a self-contained response.[/SYSTEM_PROMPT][INST]Write 20 sentences about summer.[/INST]

```

</details>

In [None]:
import os

# Model files for this run. The target is "<prefix>.gguf" when that file exists on
# disk and "<prefix>-dequant.gguf" otherwise; the draft is always the
# "-residual-IQ2_KS" file. `draft_model_prefix` and `model_name` are not defined in
# this cell and must already exist in the kernel.
target_model = f"{draft_model_prefix}.gguf"
if not os.path.exists(target_model):
    target_model = f"{draft_model_prefix}-dequant.gguf"
draft_model = f"{draft_model_prefix}-residual-IQ2_KS.gguf"

# Chat-template wrappers as (substring searched for in model_name, prefix, suffix).
# Entries are tried top to bottom and the first match wins. The match is
# case-sensitive, which is what keeps "Phi-4" and "phi-4" apart.
CHAT_TEMPLATES = [
    (
        "Qwen3",
        "<|im_start|>user\n",
        " /no_think<|im_end|>\n<|im_start|>assistant\n",
        # thinking variant of the suffix: " /think<|im_end|>\n<|im_start|>assistant\n"
    ),
    (
        "gemma-3",
        "<start_of_turn>user\n\n",
        "<end_of_turn>\n<start_of_turn>model\n",
    ),
    (
        # https://huggingface.co/microsoft/Phi-4-reasoning-plus
        "Phi-4",
        "<|im_start|>system<|im_sep|>\nYou are Phi, a language model trained by Microsoft to help users. Your role as an assistant involves thoroughly exploring questions through a systematic thinking process before providing the final precise and accurate solutions. This requires engaging in a comprehensive cycle of analysis, summarizing, exploration, reassessment, reflection, backtracing, and iteration to develop well-considered thinking process. Please structure your response into two main sections: Thought and Solution using the specified format:<think>{Thought section}</think>{Solution section}. In the Thought section, detail your reasoning process in steps. Each step should include detailed considerations such as analysing questions, summarizing relevant findings, brainstorming new ideas, verifying the accuracy of the current steps, refining any errors, and revisiting previous steps. In the Solution section, based on various attempts, explorations, and reflections from the Thought section, systematically present the final solution that you deem correct. The Solution section should be logical, accurate, and concise and detail necessary steps needed to reach the conclusion. Now, try to solve the following question through the above guidelines:<|im_end|>\n<|im_start|>user<|im_sep|>\n",
        "<|im_end|>\n<|im_start|>assistant<|im_sep|>\n",
    ),
    (
        # https://huggingface.co/microsoft/phi-4
        "phi-4",
        "<|im_start|>user<|im_sep|>\n",
        "<|im_end|>\n<|im_start|>assistant<|im_sep|>\n",
    ),
    (
        "DeepSeek-R1-Distill",
        "<｜User｜>",
        "<｜Assistant｜>",
    ),
    (
        "GLM",
        "[gMASK]<sop><|system|>\nYou are an AI assistant named ChatGLM. You are developed based on the GLM-4 language model trained by Zhipu AI, and your task is to provide appropriate responses and support for users' questions and requests.<|user|>",
        "<|assistant|>\n<think>\n",
    ),
]

# Fallback for every model name that matches none of the markers above (Llama-3).
# NOTE(review): the suffix ends with a single "\n" while the prefix uses "\n\n" after
# <|end_header_id|> -- confirm against the reference Llama-3 template whether this is
# intentional before changing it, since recorded results depend on the exact prompt.
LLAMA3_TEMPLATE = (
    "<|start_header_id|>user<|end_header_id|>\n\n",
    "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n",
)

prefix, suffix = next(
    ((head, tail) for marker, head, tail in CHAT_TEMPLATES if marker in model_name),
    LLAMA3_TEMPLATE,
)
# Mistral's prompt is too long, thus use file `-f`


LLama-70B | #Q1 | #Q2 | #Q3 | #Q4 | self-spec | #Q1 | #Q2 | #Q3 | #Q4
--- | --- |--- | --- | --- | --- | --- |--- | --- | --- 
exl3-2.0+IQ2_KS  |         |         |         |         | | 81.190% | 83.333% | 78.947% | 92.520%
exl3-2.25+IQ2_KS |         |         |         |         | | 75.652% | 77.049% | 84.856% | 93.595%
IQ2_KT+IQ2_KS    |         |         |         |         | | 80.357% | 80.319% | 69.382% | 92.453%
|
1BUD4+IQ2_K+IQ2_KS| 64.007% | 61.508% | 40.652% | 97.222%


DsLLama-70B| #Q1 | #Q2 | #Q3 | #Q4 | self-spec | #Q1 | #Q2 | #Q3 | #Q4
--- | --- |--- | --- | --- | --- | --- |--- | --- | --- 
exl3-2.0+IQ2_KS  |         |         |         |         | | 72.656% | 71.094% | 82.080% | 86.111%
exl3-2.25+IQ2_KS |         |         |         |         | | 73.101% | 74.224% | 77.826% | 85.845%
IQ2_KT+IQ2_KS    |         |         |         |         | | 71.887% | 69.427% | 77.232% | 74.516%

> LLama-49B failed with greedy decoding

Qw32B | #Q1 | #Q2 | #Q3 | #Q4 | **think** | #Q1 | #Q2 | #Q3 | #Q4
--- | --- | --- |--- | --- | --- | --- | --- |--- | ---
exl3-2.0+IQ2_KS     | 70.062% | 63.816% | 79.167% | 81.311% | | 62.279% | 73.799% | 70.989% | 70.616%
exl3-2.25+IQ2_KS    | 71.067% | 75.781% | 78.540% | 81.010% | | 66.540% | 77.752% | 82.218% | 70.446%
IQ2_KT+IQ2_KS       | 75.758% | 65.000% | 73.047% | 79.167% | | 67.115% | 71.795% | 74.516% | 69.301%
IQ2_K+IQ2_KS        | 65.051% | 79.000% | 70.640% | 83.777% | | 66.459% | 75.508% | 77.333% | 69.649%
DeepSeek-R1-Distill |         |         |         |         | | 73.636% | 78.398% | 79.268% | 85.086%
|
0.6BUD6+IQ2_M+IQ2_KS| 32.500% | 34.211% | 36.607% | 59.884% | | 33.689% | 36.534% | 42.102% | 41.173%
1.7BUD4+IQ2_M+IQ2_KS| 69.612% | 46.795% | 46.615% | 69.612% | | 38.787% | 45.099% | 44.891% | 47.663%

<details>
<summary>original</summary>

Qw32B | #Q1 | #Q2 | #Q3 | #Q4 | **think** | #Q1 | #Q2 | #Q3 | #Q4
--- | --- | --- |--- | --- | --- | --- | --- |--- | ---
IQ2_KT+IQ2_KS     | 63.081% | 64.535% | 70.175% | 77.320% | | 59.979% | 67.224% | 65.813% | 57.556%
IQ2_K+IQ2_KS      | 62.240% | 68.056% | 66.106% | 80.469% | | 55.227% | 65.724% | 67.599% | 59.016%

</details>


Gm27B | #Q1 | #Q2 | #Q3 | #Q4 | self-spec | #Q1 | #Q2 | #Q3 | #Q4
--- | --- | --- |--- | --- | --- | --- | --- |--- | ---
exl3-2.0+IQ2_KS     |         |         |         |         | | 66.509% | 74.462% | 71.223% | 84.362%
exl3-2.25+IQ2_KS    |         |         |         |         | | 70.495% | 72.152% | 77.551% | 85.776%
IQ2_KT+IQ2_KS       | 65.094% | 55.275% | 68.990% | 82.668% | | 63.551% | 68.235% | 77.604% | 88.717%
IQ2_K+IQ2_KS        | 61.875% | 57.609% | 64.423% | 81.771% | | 73.762% | 67.614% | 67.188% | 86.413%
|
4BUD+IQ2_M+IQ2_KS   | 49.603% | 47.000% | 45.859% | 80.556%
1BUD+IQ2_M+IQ2_KS   | 45.149% | 36.224% | 35.789% | 73.654%

Mis24B | #Q1 | #Q2 | #Q3 | #Q4 | self-spec | #Q1 | #Q2 | #Q3 | #Q4
--- | --- | --- |--- | --- | --- | --- | --- |--- | ---
exl3-2.0+IQ2_KS     |         |         |         |         | | 68.519% | 72.059% | 75.000% | 91.265%
exl3-2.25+IQ2_KS    |         |         |         |         | | 72.603% | 75.000% | 78.289% | 92.262%
IQ2_KT+IQ2_KS       | 61.446% | 59.091% | 55.747% | 83.911% | | 69.408% | 63.235% | 73.750% | 91.755%
IQ2_K+IQ2_KS        | 61.765% | 51.724% | 62.153% | 83.158% | | 75.938% | 88.333% | 69.366% | 86.340%

Mag24B | #Q1 | #Q2 | #Q3 | #Q4 | self-spec | #Q1 | #Q2 | #Q3 | #Q4
--- | --- | --- |--- | --- | --- | --- | --- |--- | ---
exl3-2.0+IQ2_KS     |         |         |         |         | | 81.325% | 74.550% | 77.872% | 84.255%
exl3-2.25+IQ2_KS    |         |         |         |         | | 84.802% | 68.023% | 78.528% | 84.255%
IQ2_KT+IQ2_KS       | 70.539% | 64.890% | 64.948% | 69.139% | | 78.313% | 64.236% | 75.293% | 85.086%
IQ2_K+IQ2_KS        | 78.526% | 65.927% | 66.939% | 73.654% | | 87.731% | 72.527% | 79.370% | 80.556%

Qw14B | #Q1 | #Q2 | #Q3 | #Q4 | think | #Q1 | #Q2 | #Q3 | #Q4
--- | --- | --- |--- | --- | --- | --- | --- |--- | ---
IQ2_KT+IQ2_KS       | 69.512% | 66.892% | 73.397% | 84.593% | | 65.857% | 75.704% | 74.131% | 79.796%
IQ2_K+IQ2_KS        | 68.072% | 82.500% | 80.357% | 90.000% | | 69.581% | 68.562% | 74.805% | 79.898%
DeepSeek-R1-Distill |         |         |         |         | | 67.688% | 73.732% | 84.848% | 73.180%
UD-IQ2_M+IQ2_KS     | 69.767% | 84.167% | 85.417% | 88.043% | | 70.225% | 75.787% | 76.285% | 84.255%

<details>
<summary>original</summary>

Qw14B | #Q1 | #Q2 | #Q3 | #Q4 | think | #Q1 | #Q2 | #Q3 | #Q4
--- | --- | --- |--- | --- | --- | --- | --- |--- | ---
IQ2_KT+IQ2_KS     | 58.889% | 47.857% | 62.360% | 86.765% | | 65.438% | 67.323% | 65.901% | 72.064%
IQ2_K+IQ2_KS      | 59.267% | 69.565% | 70.066% | 89.412% | | 68.704% | 64.912% | 64.912% | 73.558%
UD-IQ2_M+IQ2_KS   | 68.391% | 61.538% | 76.587% | 83.854% | | 61.529% | 72.064% | 67.025% | 79.268%

</details>

Phi14B | #Q1 | #Q2 | #Q3 | #Q4 | self-spec | #Q1 | #Q2 | #Q3 | #Q4
--- | --- | --- |--- | --- | --- | --- | --- |--- | ---
IQ2_KT+IQ2_KS   | 59.906% | 61.538% | 82.955% | 84.280% | | 72.500% | 61.250% | 92.647% | 91.250%
IQ2_K+IQ2_KS    | 59.167% | 53.846% | 86.429% | 87.308% | | 68.333% | 67.391% | 89.286% | 91.929%
UD-IQ2_M+IQ2_KS | 60.052% | 66.935% | 77.273% | 82.222% | | 75.000% | 76.852% | 82.031% | 90.833%

> Phi14BR failed with greedy decoding


In [None]:
prompt = prefix+"Write 20 sentences about summer."+suffix #Q1
# prompt = prefix+"Who was the first prime minister of Britain?"+suffix #Q2
# prompt = prefix+"How many persons are needed to power a 800W toaster?"+suffix #Q3
# prompt = prefix+"Write the Quicksort algorithm in TypeScript."+suffix #Q4

# https://github.com/ggml-org/llama.cpp/discussions/10466#discussioncomment-11501175
!llama.cpp/cuda_debug/bin/llama-speculative -m {target_model} -md {draft_model} \
-p "{prompt}" -c 2048 -cd 2048 -n 1024 --seed 42 --draft-max 4 --draft-min 4 \
--top-k 20 --temp 0.6 --top-p 0.95 --draft-p-min 0.0 -t 8 -fa --color -ngl 100 -ngld 100 #--sampling-seq k
draft_model
print(prompt)

In [None]:
import os

target_model = draft_model_prefix + "-IQ2_KS.gguf"
draft_model  = draft_model_prefix + ".gguf"
# target_model = f"models/Llama-3.3-70B-Instruct-IQ2_K-IQ2_KS.gguf"
# draft_model  = f"models/Llama-3.2-1B-Instruct-UD-Q4_K_XL.gguf"

if not os.path.exists(draft_model):
    draft_model = draft_model_prefix + "-dequant.gguf"

prompt = prefix+"Write 20 sentences about summer."+suffix #Q1

!llama.cpp/cuda_build/bin/llama-speculative-simple -m {target_model} -md {draft_model} \
-p "{prompt}" -c 2048 -cd 2048 -n 1024 --sampling-seq k --seed 42 --draft-max 4 --draft-min 4 \
--top-k 1 --draft-p-min 0.0 -t 8 -fa --color -ngl 100 -ngld 100
draft_model

In [None]:
!llama.cpp/cuda_build/bin/llama-cli -m {target_model} -p "{prompt}" -c 2048 -n 512 --top-k 1 --seed 42 -ngl 100 -no-cnv 2>/dev/null
target_model

In [None]:
!llama.cpp/cuda_build/bin/llama-cli -m {draft_model}  -p "{prompt}" -c 2048 -n 512 --top-k 1 --seed 42 -ngl 100 -no-cnv 2>/dev/null
draft_model