In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
torch.manual_seed(0)
device = torch.device(f'cuda:{1}' if torch.cuda.is_available() else 'cpu')
torch.cuda.set_device(device) 

# Load model and tokenizer
model_id = "your model"
model = AutoModelForCausalLM.from_pretrained(model_id).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Print model size
print(f"Model size: {model.get_memory_footprint():,} bytes")

Model size: 4,943,257,728 bytes


 1단계: Weight 추출 (Layer별로 Linear 계층만)

In [2]:
def get_linear_weights(model):
    """
    LoRA / LoQT 적용 가능한 nn.Linear weight만 추출
    """
    weight_list = []
    layer_names = []

    for name, module in model.named_modules():
        if isinstance(module, torch.nn.Linear):
            if hasattr(module, 'weight') and module.weight is not None:
                weight_list.append(module.weight.detach().cpu())
                layer_names.append(name)
    return weight_list, layer_names

weights_fp, names = get_linear_weights(model)
print(f"✅ 총 {len(weights_fp)}개의 Linear layer weight 추출됨")

✅ 총 113개의 Linear layer weight 추출됨


 2단계: Weight 양자화 (Absmax 기준)

In [3]:
def absmax_quantize(weight):
    max_val = weight.abs().max()
    scale = max_val / 127
    quant = torch.round(weight / scale).clamp(-127, 127)
    dequant = quant * scale
    return dequant, scale

weights_quant = []
for w in weights_fp:
    dequant, _ = absmax_quantize(w)
    weights_quant.append(dequant)


 3단계: Low-Rank 보정 (LoQT 방식 SVD)

In [4]:
def loqt_restore(fp_weight, quant_weight, rank=8):
    delta = fp_weight - quant_weight
    original_shape = delta.shape
    delta_matrix = delta.view(original_shape[0], -1)

    # SVD
    U, S, Vh = torch.linalg.svd(delta_matrix, full_matrices=False)
    P = U[:, :rank]                           # P: m x r
    S_r = torch.diag(S[:rank])
    B = S_r @ Vh[:rank, :]                    # B: r x n
    delta_approx = P @ B
    W_restored = quant_weight + delta_approx
    return W_restored.view(original_shape), P, B


 4단계: 전체 레이어 복원 루프

In [5]:
weights_restored = []
PB_pairs = []

for i in range(len(weights_fp)):
    try:
        W_rec, P, B = loqt_restore(weights_fp[i], weights_quant[i], rank=8)
        weights_restored.append(W_rec)
        PB_pairs.append((P, B))
    except Exception as e:
        print(f"[Error] Layer {i} - {names[i]}: {e}")
        weights_restored.append(weights_quant[i])
        PB_pairs.append((None, None))


In [6]:
for i in range(len(weights_fp)):
    mse_q = torch.mean((weights_fp[i] - weights_quant[i])**2).item()
    mse_r = torch.mean((weights_fp[i] - weights_restored[i])**2).item()
    print(f"[{names[i]}] Quant MSE: {mse_q:.6f}, Restored MSE: {mse_r:.6f}")


[model.layers.0.self_attn.q_proj] Quant MSE: 0.000003, Restored MSE: 0.000003
[model.layers.0.self_attn.k_proj] Quant MSE: 0.000002, Restored MSE: 0.000002
[model.layers.0.self_attn.v_proj] Quant MSE: 0.000000, Restored MSE: 0.000000
[model.layers.0.self_attn.o_proj] Quant MSE: 0.000001, Restored MSE: 0.000001
[model.layers.0.mlp.gate_proj] Quant MSE: 0.000002, Restored MSE: 0.000002
[model.layers.0.mlp.up_proj] Quant MSE: 0.000000, Restored MSE: 0.000000
[model.layers.0.mlp.down_proj] Quant MSE: 0.000002, Restored MSE: 0.000002
[model.layers.1.self_attn.q_proj] Quant MSE: 0.000001, Restored MSE: 0.000001
[model.layers.1.self_attn.k_proj] Quant MSE: 0.000000, Restored MSE: 0.000000
[model.layers.1.self_attn.v_proj] Quant MSE: 0.000000, Restored MSE: 0.000000
[model.layers.1.self_attn.o_proj] Quant MSE: 0.000002, Restored MSE: 0.000001
[model.layers.1.mlp.gate_proj] Quant MSE: 0.000004, Restored MSE: 0.000004
[model.layers.1.mlp.up_proj] Quant MSE: 0.000007, Restored MSE: 0.000006
[mode

복원된 weight → 모델 반영

In [7]:
from copy import deepcopy

# 원본 모델 복사 (보정 모델 만들기)
model_restored = deepcopy(model)

linear_idx = 0

for name, module in model_restored.named_modules():
    if isinstance(module, torch.nn.Linear):
        if linear_idx < len(weights_restored) and PB_pairs[linear_idx][0] is not None:
            restored_weight = weights_restored[linear_idx]
            module.weight.data.copy_(restored_weight.to(module.weight.dtype))
        linear_idx += 1

print(f"✅ 복원된 weight를 {linear_idx}개의 Linear layer에 반영 완료")


✅ 복원된 weight를 113개의 Linear layer에 반영 완료


In [8]:
total_mse_quant = 0.0
total_mse_restored = 0.0
count = 0

for i in range(len(weights_fp)):
    if weights_fp[i].shape != weights_quant[i].shape:
        continue
    mse_q = torch.mean((weights_fp[i] - weights_quant[i]) ** 2).item()
    mse_r = torch.mean((weights_fp[i] - weights_restored[i]) ** 2).item()
    total_mse_quant += mse_q
    total_mse_restored += mse_r
    count += 1

avg_mse_q = total_mse_quant / count
avg_mse_r = total_mse_restored / count

print(f"\n📊 전체 평균 MSE")
print(f"   - Quantized 평균 MSE   : {avg_mse_q:.6f}")
print(f"   - Restored  평균 MSE   : {avg_mse_r:.6f}")



📊 전체 평균 MSE
   - Quantized 평균 MSE   : 0.000001
   - Restored  평균 MSE   : 0.000001


In [9]:
# Store original weights
weights = [param.data.cpu().clone() for param in model.parameters()]

# Create model to quantize
model_abs = deepcopy(model)

# Quantize all model weights
weights_abs = []
for param in model_abs.parameters():
    _, dequantized = absmax_quantize(param.data)
    param.data = dequantized
    weights_abs.append(dequantized)

In [15]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_id)  # or your model name
prompt = "Tell me a fun fact about the moon."
inputs = tokenizer(prompt, return_tensors="pt").to("cuda" if torch.cuda.is_available() else "cpu")


In [16]:
model.eval()
model_abs.eval()
model_restored.eval()


LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 2048)
    (layers): ModuleList(
      (0-15): 16 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=512, bias=False)
          (v_proj): Linear(in_features=2048, out_features=512, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((2048,), eps=1e-05)
    (rotary_emb):

In [17]:
def generate_text(model, inputs, max_new_tokens=30):
    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False
        )
    return tokenizer.decode(output[0], skip_special_tokens=True)

print("\n🧠 🔍 세 모델 출력 비교:")
print("-" * 40)
print("🧪 [FP Model]")
print(generate_text(model, inputs))

print("-" * 40)
print("🧪 [Quantized Model]")
print(generate_text(model_abs, inputs))

print("-" * 40)
print("🧪 [Restored Model]")
print(generate_text(model_restored, inputs))


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



🧠 🔍 세 모델 출력 비교:
----------------------------------------
🧪 [FP Model]


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Tell me a fun fact about the moon. I’m going to guess you’re going to say it’s the only natural satellite in the solar system. Well, I’m here to tell you that
----------------------------------------
🧪 [Quantized Model]


RuntimeError: 'weight' must be 2-D

In [19]:
from copy import deepcopy

weights_fp = []
names = []

for name, module in model.named_modules():
    if isinstance(module, torch.nn.Linear) and module.weight is not None:
        weights_fp.append(module.weight.detach().cpu())
        names.append(name)
weights_quant = []
for w in weights_fp:
    q, _ = absmax_quantize(w)
    weights_quant.append(q)
weights_restored = []
for i in range(len(weights_fp)):
    try:
        restored, _, _ = loqt_restore(weights_fp[i], weights_quant[i], rank=8)
        weights_restored.append(restored)
    except Exception as e:
        print(f"[{names[i]}] 보정 실패: {e}")
        weights_restored.append(weights_quant[i])  # fallback



model_restored = deepcopy(model)
i = 0

for name, module in model_restored.named_modules():
    if isinstance(module, torch.nn.Linear):
        module.weight.data.copy_(weights_restored[i].to(module.weight.dtype))
        i += 1


OutOfMemoryError: CUDA out of memory. Tried to allocate 64.00 MiB. GPU 1 has a total capacity of 23.52 GiB of which 57.88 MiB is free. Process 3201349 has 920.00 MiB memory in use. Process 3250141 has 920.00 MiB memory in use. Including non-PyTorch memory, this process has 21.48 GiB memory in use. Of the allocated memory 21.03 GiB is allocated by PyTorch, and 6.65 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)