In [1]:
import torch
torch.cuda.is_available()

True

In [1]:
import torch
def absmax_quantize(X):
    # Calculate scale
    scale = 127 / torch.max(torch.abs(X))

    # Quantize
    X_quant = (scale * X).round()

    # Dequantize
    X_dequant = X_quant / scale

    return X_quant.to(torch.int8), X_dequant

In [2]:
def zeropoint_quantize(X):
    # Calculate value range (denominator)
    x_range = torch.max(X) - torch.min(X)
    x_range = 1 if x_range == 0 else x_range

    # Calculate scale
    scale = 255 / x_range

    # Shift by zero-point
    zeropoint = (-scale * torch.min(X) - 128).round()

    # Scale and round the inputs
    X_quant = torch.clip((X * scale + zeropoint).round(), -128, 127)

    # Dequantize
    X_dequant = (X_quant - zeropoint) / scale

    return X_quant.to(torch.int8), X_dequant

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
torch.manual_seed(0)
device = torch.device(f'cuda:{1}' if torch.cuda.is_available() else 'cpu')
torch.cuda.set_device(device) 

# Load model and tokenizer
model_id = "your model"
model = AutoModelForCausalLM.from_pretrained(model_id).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Print model size
print(f"Model size: {model.get_memory_footprint():,} bytes")

Model size: 4,943,257,728 bytes


In [4]:
# Extract weights of the first layer
weights = model.model.layers[0].self_attn.q_proj.weight

print("Original weights:")
print(weights)
print(weights.size())

# Quantize layer using absmax quantization
weights_abs_quant, _ = absmax_quantize(weights)
print("\nAbsmax quantized weights:")
print(weights_abs_quant)
print(weights_abs_quant.size())

# Quantize layer using absmax quantization
weights_zp_quant, _ = zeropoint_quantize(weights)
print("\nZero-point quantized weights:")
print(weights_zp_quant)
print(weights_zp_quant.size())

# print("\nminus :")
# minus=torch.sub(weights,weights_abs_quant)
# print(minus)
# print(minus.size())

# print("\n(W-W_q)+W_q")
# plus=torch.add(minus,weights_abs_quant)
# print(plus)
# print(plus.size())

Original weights:
Parameter containing:
tensor([[-0.0183,  0.0071,  0.0219,  ..., -0.0070, -0.0089,  0.0149],
        [ 0.0112,  0.0593,  0.0630,  ..., -0.0334, -0.0148,  0.0058],
        [ 0.0182,  0.0141,  0.0361,  ..., -0.0432, -0.0388, -0.0233],
        ...,
        [ 0.0305,  0.0289,  0.0801,  ..., -0.0767, -0.0311, -0.0334],
        [ 0.0242, -0.0325,  0.0369,  ..., -0.0123, -0.0269, -0.0151],
        [-0.0264, -0.0498, -0.0210,  ...,  0.0601,  0.0130, -0.0007]],
       device='cuda:1', requires_grad=True)
torch.Size([2048, 2048])

Absmax quantized weights:
tensor([[ -3,   1,   4,  ...,  -1,  -2,   3],
        [  2,  11,  11,  ...,  -6,  -3,   1],
        [  3,   3,   6,  ...,  -8,  -7,  -4],
        ...,
        [  5,   5,  14,  ..., -14,  -6,  -6],
        [  4,  -6,   7,  ...,  -2,  -5,  -3],
        [ -5,  -9,  -4,  ...,  11,   2,   0]], device='cuda:1',
       dtype=torch.int8)
torch.Size([2048, 2048])

Zero-point quantized weights:
tensor([[ 6, 11, 14,  ...,  9,  8, 13],
  

In [6]:
import torch

def absmax_quantize(weight):
    max_val = weight.abs().max()
    scale = max_val / 127
    quant = torch.round(weight / scale).clamp(-127, 127)
    dequant = quant * scale
    return dequant, scale

def low_rank_restore(fp_weight, quant_weight, rank=8):
    if fp_weight.numel() < 2 or quant_weight.numel() < 2:
        raise ValueError("Too small tensor for SVD")

    original_shape = fp_weight.shape
    fp_matrix = fp_weight.view(original_shape[0], -1)
    quant_matrix = quant_weight.view(original_shape[0], -1)

    delta = fp_matrix - quant_matrix
    U, S, Vh = torch.linalg.svd(delta, full_matrices=False)
    U_r = U[:, :rank]
    S_r = torch.diag(S[:rank])
    V_r = Vh[:rank, :]
    delta_approx = U_r @ S_r @ V_r
    W_approx = quant_matrix + delta_approx

    return W_approx.view(original_shape), delta_approx.view(original_shape)


# 1. Extract original weight
fp_weight = model.model.layers[0].self_attn.q_proj.weight.detach().cpu()

# 2. Quantize
quant_weight, scale_abs = absmax_quantize(fp_weight)

# 3. Restore
W_restored, delta_approx = low_rank_restore(fp_weight, quant_weight, rank=16)

# 4. Compare MSE
mse_original = torch.mean((fp_weight - quant_weight) ** 2).item()
mse_recovered = torch.mean((fp_weight - W_restored) ** 2).item()

print(f"원래 quantization MSE: {mse_original:.6f}")
print(f"보정 후 MSE (Low-rank {16}): {mse_recovered:.6f}")

# 5. Shape 확인
print(f"fp_weight.shape     = {fp_weight.shape}")
print(f"W_restored.shape    = {W_restored.shape}")


원래 quantization MSE: 0.000003
보정 후 MSE (Low-rank 16): 0.000003
fp_weight.shape     = torch.Size([2048, 2048])
W_restored.shape    = torch.Size([2048, 2048])


In [None]:
import torch
from copy import deepcopy

# ---------- Quantization Functions ----------
def absmax_quantize(weight):
    max_val = weight.abs().max()
    scale = max_val / 127
    quant = torch.round(weight / scale).clamp(-127, 127)
    dequant = quant * scale
    return dequant, scale

# 0

def low_rank_restore(fp_weight, quant_weight, rank=8):
    if fp_weight.numel() < 2 or quant_weight.numel() < 2:
        raise ValueError("Too small tensor for SVD")

    original_shape = fp_weight.shape
    fp_matrix = fp_weight.view(original_shape[0], -1)
    quant_matrix = quant_weight.view(original_shape[0], -1)

    delta = fp_matrix - quant_matrix
    U, S, Vh = torch.linalg.svd(delta, full_matrices=False)
    U_r = U[:, :rank]
    S_r = torch.diag(S[:rank])
    V_r = Vh[:rank, :]
    delta_approx = U_r @ S_r @ V_r
    W_approx = quant_matrix + delta_approx

    return W_approx.view(original_shape), delta_approx.view(original_shape)


# ---------- Step 1: 원본 weight 저장 ----------
# 1. Extract original weight
fp_weight = [param.data.cpu().clone() for param in model.parameters()]

# ---------- Step 2: absmax 양자화 모델 ----------
model_abs = deepcopy(model)
weights_abs = []

for param in model_abs.parameters():
    _, dequantized = absmax_quantize(param.data)
    param.data = dequantized
    weights_abs.append(dequantized)

# # ---------- Step 3: zero-point 양자화 모델 ----------
# model_zp = deepcopy(model)
# weights_zp = []

# for param in model_zp.parameters():
#     _, dequant = zeropoint_quantize(param.data.cpu())
#     param.data.copy_(dequant.to(dtype=param.data.dtype))
#     weights_zp.append(dequant.clone())

# ---------- Step 4: low-rank 보정 모델 ----------
model_restored = deepcopy(model)
weights_restored=[]

for i, param in enumerate(model_abs.parameters()):
    fp = fp_weight[i]        # 저장된 full-precision weight
    quant = weights_abs[i]    # 저장된 absmax quantized weight

    try:
        restored, _ = low_rank_restore(fp, quant, rank=8)
        param.data.copy_(restored.to(dtype=param.data.dtype))
    except Exception as e:
        print(f"[Error] Layer {i}: {e}")



[Error] Layer 0: Too small tensor for SVD
[Error] Layer 1: Too small tensor for SVD
[Error] Layer 2: Too small tensor for SVD
[Error] Layer 3: Too small tensor for SVD
[Error] Layer 4: Too small tensor for SVD
[Error] Layer 5: Too small tensor for SVD
[Error] Layer 6: Too small tensor for SVD
[Error] Layer 7: Too small tensor for SVD
[Error] Layer 8: Too small tensor for SVD
[Error] Layer 9: Too small tensor for SVD
[Error] Layer 10: Too small tensor for SVD
[Error] Layer 11: Too small tensor for SVD
[Error] Layer 12: Too small tensor for SVD
[Error] Layer 13: Too small tensor for SVD
[Error] Layer 14: Too small tensor for SVD
[Error] Layer 15: Too small tensor for SVD
[Error] Layer 16: Too small tensor for SVD
[Error] Layer 17: Too small tensor for SVD
[Error] Layer 18: Too small tensor for SVD
[Error] Layer 19: Too small tensor for SVD
[Error] Layer 20: Too small tensor for SVD
[Error] Layer 21: Too small tensor for SVD
[Error] Layer 22: Too small tensor for SVD
[Error] Layer 23: Too

: 

In [15]:
print(f"param.shape = {param.data.shape}")
print(f"restored.shape = {restored.shape}")


param.shape = torch.Size([128256, 2048])
restored.shape = torch.Size([2048, 2048])


In [None]:
def generate_text(model, input_text, max_length=50):
    input_ids = tokenizer.encode(input_text, return_tensors='pt').to(device)
    output = model.generate(inputs=input_ids,
                            max_length=max_length,
                            do_sample=True,
                            top_k=30,
                            pad_token_id=tokenizer.eos_token_id,
                            attention_mask=input_ids.new_ones(input_ids.shape))
    return tokenizer.decode(output[0], skip_special_tokens=True)

# Generate text with original and quantized models
original_text = generate_text(model, "I have a dream")
absmax_text   = generate_text(model_abs, "I have a dream")
# zp_text       = generate_text(model_zp, "I have a dream")

print(f"Original model:\n{original_text}")
print("-" * 50)
print(f"Absmax model:\n{absmax_text}")
print("-" * 50)
# print(f"Zeropoint model:\n{zp_text}")

Original model:
I have a dream that one day the people of the world will live together in a world without race prejudice. I have a dream that one day, the sons of former slaves and the sons of former slave owners will be able to sit down together
--------------------------------------------------
Absmax model:
I have a dream that one day every child will be able to sleep in a safe, secure, and comfortable bed, that every child will have a chance to grow up in a safe, secure, and comfortable home, that every child will have
--------------------------------------------------


: 