## Tens Decomp

In [3]:
from mmlu_benchmark import MMLUEvaluator
from transformers import AutoModelForCausalLM, AutoTokenizer


import torch.nn as nn
import torch

from tqdm import tqdm

import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def dequantize_awq_layer(awq_layer):
    
    scale_zeros = awq_layer.qzeros
    scales = awq_layer.scales
    qweight = awq_layer.qweight
    
    # Константы распаковки
    bits = 4
    group_size = 128
    
    # В реальности для распаковки проще всего прогнать Identity матрицу через слой,
    # если нет доступа к низкоуровневым функциям unpack.
    # Это потребляет VRAM, но надежно работает с любой версией кернела.
    
    # Создаем единичную матрицу размером (In_Features, In_Features)
    # Qwen 7B hidden size: 3584 (или 4096 в зависимости от версии)
    in_features = awq_layer.in_features
    
    # Прогоняем батчами, чтобы не взорвать память
    weights = []
    batch_size = 128 
    
    identity = torch.eye(in_features, dtype=torch.float16, device=awq_layer.device)
    
    with torch.no_grad():
        for i in range(0, in_features, batch_size):
            batch = identity[i : i + batch_size]
            # AWQ layer forward pass: x @ W.T
            # Получаем строки весов
            out = awq_layer(batch)
            weights.append(out)
            
    # Собираем полную матрицу W (Batch/In, Out)
    W_fp16 = torch.cat(weights, dim=0)
    
    # Transpose, чтобы получить форму (Out, In) как в nn.Linear.weight
    return W_fp16.t()

def apply_svd_to_head(model, rank_ratio=0.1):
    lm_head = model.lm_head
    print(f"Original lm_head type: {type(lm_head)}")
    
    # 1. Деквантизация
    # Если модель загружена через AutoAWQForCausalLM, слой будет типа WQLinear
    if not isinstance(lm_head, nn.Linear):
        # Пытаемся деквантовать
        W = dequantize_awq_layer(lm_head)
    else:
        # Если вдруг FP16
        W = lm_head.weight.data
        
    print(f"Recovered weight shape: {W.shape}")
    
    # 2. SVD
    print("Computing SVD...")
    # W shape: (Vocab, Hidden) -> (151936, 3584)
    # Используем float32 для точности SVD, потом вернем в fp16
    U, S, Vh = torch.linalg.svd(W.float(), full_matrices=False)
    
    # 3. Обрезка ранга
    full_rank = len(S)
    target_rank = int(full_rank * rank_ratio)
    print(f"Reducing rank from {full_rank} to {target_rank}")
    
    U_r = U[:, :target_rank]
    S_r = S[:target_rank]
    Vh_r = Vh[:target_rank, :]
    
    # 4. Создаем два слоя
    # W ~ U @ S @ Vh
    # Head разбивается на:
    # Layer 1 (Project Down): Hidden -> Rank
    # Layer 2 (Project Up): Rank -> Vocab
    
    sqrt_S = torch.diag(torch.sqrt(S_r))
    
    # Down projection (B): Hidden -> Rank
    # B = sqrt(S) @ Vh  -> shape (Rank, Hidden)
    W_down = torch.matmul(sqrt_S, Vh_r).to(W.device).half()
    
    # Up projection (A): Rank -> Vocab
    # A = U @ sqrt(S) -> shape (Vocab, Rank)
    W_up = torch.matmul(U_r, sqrt_S).to(W.device).half()
    
    # Создаем модули
    # Отключаем bias, так как в lm_head его обычно нет
    linear_down = nn.Linear(W.shape[1], target_rank, bias=False)
    linear_up = nn.Linear(target_rank, W.shape[0], bias=False)
    
    linear_down.weight.data = W_down
    linear_up.weight.data = W_up
    
    # 5. Замена в модели
    new_head = nn.Sequential(linear_down, linear_up)
    model.lm_head = new_head
    print("lm_head replaced with SVD approximation.")
    
    return model


In [3]:
q_model = AutoModelForCausalLM.from_pretrained(
    "./models/Qwen3-8B-AWQ-MMLU", 
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True
)
tokenizer = AutoTokenizer.from_pretrained("./models/Qwen3-8B-AWQ-MMLU", trust_remote_code=True)

I have left this message as the final dev message to help you transition.

Important Notice:
- AutoAWQ is officially deprecated and will no longer be maintained.
- The last tested configuration used Torch 2.6.0 and Transformers 4.51.3.
- If future versions of Transformers break AutoAWQ compatibility, please report the issue to the Transformers project.

Alternative:
- AutoAWQ has been adopted by the vLLM Project: https://github.com/vllm-project/llm-compressor

For further inquiries, feel free to reach out:
- X: https://x.com/casper_hansen_
- LinkedIn: https://www.linkedin.com/in/casper-hansen-804005170/

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.13it/s]


In [None]:
apply_svd_to_head(q_model, rank_ratio=0.8)

Original lm_head type: <class 'torch.nn.modules.linear.Linear'>
Recovered weight shape: torch.Size([151936, 4096])
Computing SVD...
Reducing rank from 4096 to 3276
lm_head replaced with SVD approximation.


Qwen3ForCausalLM(
  (model): Qwen3Model(
    (embed_tokens): Embedding(151936, 4096)
    (layers): ModuleList(
      (0-35): 36 x Qwen3DecoderLayer(
        (self_attn): Qwen3Attention(
          (q_proj): WQLinear_GEMM(in_features=4096, out_features=4096, bias=False, w_bit=4, group_size=128)
          (k_proj): WQLinear_GEMM(in_features=4096, out_features=1024, bias=False, w_bit=4, group_size=128)
          (v_proj): WQLinear_GEMM(in_features=4096, out_features=1024, bias=False, w_bit=4, group_size=128)
          (o_proj): WQLinear_GEMM(in_features=4096, out_features=4096, bias=False, w_bit=4, group_size=128)
          (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
          (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
        )
        (mlp): Qwen3MLP(
          (gate_proj): WQLinear_GEMM(in_features=4096, out_features=12288, bias=False, w_bit=4, group_size=128)
          (up_proj): WQLinear_GEMM(in_features=4096, out_features=12288, bias=False, w_bit=4, group_size=128)
          (down_proj)

In [8]:
q_model.save_pretrained("Qwen3-8B-AWQ-SVD_lm_head-MMLU")
tokenizer.save_pretrained("Qwen3-8B-AWQ-SVD_lm_head-MMLU")
torch.save(q_model, "Qwen3-8B-AWQ-SVD_lm_head-MMLU/model.pt")

In [4]:
q_model = torch.load("./models/Qwen3-8B-AWQ-SVD_lm_head-MMLU/model.pt", weights_only=False)
tokenizer = AutoTokenizer.from_pretrained("./models/Qwen3-8B-AWQ-SVD_lm_head-MMLU")

evaluator = MMLUEvaluator(
    model=q_model, tokenizer=tokenizer, device="cuda",
    split="dev", per_subject_samples=10, seed=42, model_name="svd_qwen_mlp_2048rnk"
)

_ = evaluator.evaluate()

I have left this message as the final dev message to help you transition.

Important Notice:
- AutoAWQ is officially deprecated and will no longer be maintained.
- The last tested configuration used Torch 2.6.0 and Transformers 4.51.3.
- If future versions of Transformers break AutoAWQ compatibility, please report the issue to the Transformers project.

Alternative:
- AutoAWQ has been adopted by the vLLM Project: https://github.com/vllm-project/llm-compressor

For further inquiries, feel free to reach out:
- X: https://x.com/casper_hansen_
- LinkedIn: https://www.linkedin.com/in/casper-hansen-804005170/



  Загружена dev выборка
  Всего вопросов в dev выборке: 285
  Количество предметов: 57
Инициализация завершена. Эксперимент: svd_qwen_mlp_2048rnk_dev_20251222_221600

Эксперимент: svd_qwen_mlp_2048rnk_dev_20251222_221600
Модель: svd_qwen_mlp_2048rnk
Всего вопросов в dev: 285
Количество предметов: 57
Промпт стиль: zero-shot


57it [01:13,  1.29s/it]

ОБЩАЯ ТОЧНОСТЬ: 0.6982 (69.82%)
Правильных ответов: 199 из 285
Оценено предметов: 57
Пиковое потребление VRAM: 5787.29 MB



