In [1]:
import torch

def absmax_quantize(X):
    """Quantize a tensor to int8 with a single symmetric absmax scale.

    The scale is ``127 / max(|X|)`` computed over the whole tensor, so the
    element with the largest magnitude maps to +/-127.

    Args:
        X: Tensor to quantize.

    Returns:
        A tuple ``(X_quant, X_dequant)``: the rounded values cast to
        ``torch.int8``, and those rounded values divided by the scale again
        (i.e. the approximation of ``X`` that the int8 tensor represents).
    """
    max_abs = torch.max(torch.abs(X))

    # An all-zero tensor would give scale = 127 / 0 = inf, and 0 * inf = NaN.
    # Swap the divisor for 1 in exactly that case so zeros quantize to zeros.
    # The `== 0` test leaves a NaN maximum untouched, so NaN inputs still
    # propagate instead of being silently masked.
    safe_max = torch.where(max_abs == 0, torch.ones_like(max_abs), max_abs)
    scale = 127 / safe_max

    # Quantize
    X_quant = (scale * X).round()

    # Dequantize
    X_dequant = X_quant / scale

    return X_quant.to(torch.int8), X_dequant

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
# Seed PyTorch's RNG
torch.manual_seed(0)

# Use the GPU when one is visible, otherwise stay on the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the causal LM (moved onto `device`) and its matching tokenizer
model_id = "your_model_name"
model = AutoModelForCausalLM.from_pretrained(model_id)
model = model.to(device)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Report how much memory the loaded model occupies
footprint_bytes = model.get_memory_footprint()
print(f"Model size: {footprint_bytes:,} bytes")



Model size: 4,943,257,728 bytes


In [5]:
# Grab the q_proj weight tensor from the first entry of model.model.layers
first_layer = model.model.layers[0]
weights = first_layer.self_attn.q_proj.weight.data

print("Original weights:")
print(weights)

# Absmax-quantize that tensor; only the int8 result is shown here, the
# dequantized copy returned alongside it is not used in this cell
weights_abs_quant = absmax_quantize(weights)[0]
print("\nAbsmax quantized weights:")
print(weights_abs_quant)

Original weights:
tensor([[-0.0183,  0.0071,  0.0219,  ..., -0.0070, -0.0089,  0.0149],
        [ 0.0112,  0.0593,  0.0630,  ..., -0.0334, -0.0148,  0.0058],
        [ 0.0182,  0.0141,  0.0361,  ..., -0.0432, -0.0388, -0.0233],
        ...,
        [ 0.0305,  0.0289,  0.0801,  ..., -0.0767, -0.0311, -0.0334],
        [ 0.0242, -0.0325,  0.0369,  ..., -0.0123, -0.0269, -0.0151],
        [-0.0264, -0.0498, -0.0210,  ...,  0.0601,  0.0130, -0.0007]],
       device='cuda:0')

Absmax quantized weights:
tensor([[ -3,   1,   4,  ...,  -1,  -2,   3],
        [  2,  11,  11,  ...,  -6,  -3,   1],
        [  3,   3,   6,  ...,  -8,  -7,  -4],
        ...,
        [  5,   5,  14,  ..., -14,  -6,  -6],
        [  4,  -6,   7,  ...,  -2,  -5,  -3],
        [ -5,  -9,  -4,  ...,  11,   2,   0]], device='cuda:0',
       dtype=torch.int8)


In [None]:
import torch
from transformers import AutoModelForCausalLM

def absmax_quantize_4bit(X):
    """Quantize a tensor to the signed 4-bit range with one absmax scale.

    The scale is ``7 / max(|X|)`` over the whole tensor; scaled values are
    rounded and clamped to ``[-8, 7]``.

    Args:
        X: Tensor to quantize.

    Returns:
        A tuple ``(X_quant, X_dequant)``: the clamped integers stored one per
        ``torch.int8`` element (no packing is done here), and those integers
        divided by the scale again.
    """
    max_q = 7
    min_q = -8

    max_abs = torch.max(torch.abs(X))
    # An all-zero tensor would give scale = 7 / 0 = inf, and 0 * inf = NaN.
    # Swap the divisor for 1 in exactly that case so zeros quantize to zeros.
    # The `== 0` test leaves a NaN maximum untouched, so NaN inputs still
    # propagate instead of being silently masked.
    safe_max = torch.where(max_abs == 0, torch.ones_like(max_abs), max_abs)
    scale = max_q / safe_max

    X_quant = torch.clamp((scale * X).round(), min_q, max_q)
    X_dequant = X_quant / scale
    return X_quant.to(torch.int8), X_dequant

# Load the model with float16 weights, move it to the chosen device and
# switch it to eval mode
model_name = "your_model_name"
device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16)
model = model.to(device)
model.eval()

# Run 4-bit absmax quantization on every Q/K/V/O projection weight of every
# layer and print a short summary. The results are only printed; nothing is
# written back into the model in this cell.
attention_projections = ("q_proj", "k_proj", "v_proj", "o_proj")
for i, layer in enumerate(model.model.layers):
    attn = layer.self_attn
    for proj_name in attention_projections:
        # Up-cast to float32 before quantizing
        projection = getattr(attn, proj_name)
        weight_fp = projection.weight.data.float()
        weight_int4, weight_dequant = absmax_quantize_4bit(weight_fp)

        print(f"\n[Layer {i}] {proj_name}")
        print("Original (FP16):", weight_fp.shape)
        print("Quantized (int4):", weight_int4.shape)
        # First five elements of the flattened dequantized tensor
        print("Dequantized sample:", weight_dequant.flatten()[:5])



[Layer 0] q_proj
Original (FP16): torch.Size([2048, 2048])
Quantized (int4): torch.Size([2048, 2048])
Dequantized sample: tensor([-0., 0., 0., -0., -0.], device='cuda:0')

[Layer 0] k_proj
Original (FP16): torch.Size([512, 2048])
Quantized (int4): torch.Size([512, 2048])
Dequantized sample: tensor([ 0.0938,  0.0938,  0.0938, -0.0000, -0.0938], device='cuda:0')

[Layer 0] v_proj
Original (FP16): torch.Size([512, 2048])
Quantized (int4): torch.Size([512, 2048])
Dequantized sample: tensor([0.0100, -0.0000, 0.0100, 0.0000, 0.0100], device='cuda:0')

[Layer 0] o_proj
Original (FP16): torch.Size([2048, 2048])
Quantized (int4): torch.Size([2048, 2048])
Dequantized sample: tensor([0., 0., 0., 0., 0.], device='cuda:0')

[Layer 1] q_proj
Original (FP16): torch.Size([2048, 2048])
Quantized (int4): torch.Size([2048, 2048])
Dequantized sample: tensor([-0.0592,  0.0000, -0.0000, -0.0000, -0.0592], device='cuda:0')

[Layer 1] k_proj
Original (FP16): torch.Size([512, 2048])
Quantized (int4): torch.Si

In [8]:
import torch
import numpy as np
from copy import deepcopy

def absmax_quantize_4bit(X):
    """Quantize a tensor to the signed 4-bit range with one absmax scale.

    The scale is ``7 / max(|X|)`` over the whole tensor; scaled values are
    rounded and clamped to ``[-8, 7]``.

    Args:
        X: Tensor to quantize.

    Returns:
        A tuple ``(X_quant, X_dequant)``: the clamped integers stored one per
        ``torch.int8`` element (no packing is done here), and those integers
        divided by the scale again.
    """
    max_q = 7
    min_q = -8

    max_abs = torch.max(torch.abs(X))
    # An all-zero tensor would give scale = 7 / 0 = inf, and 0 * inf = NaN.
    # Swap the divisor for 1 in exactly that case so zeros quantize to zeros.
    # The `== 0` test leaves a NaN maximum untouched, so NaN inputs still
    # propagate instead of being silently masked.
    safe_max = torch.where(max_abs == 0, torch.ones_like(max_abs), max_abs)
    scale = max_q / safe_max

    X_quant = torch.clamp((scale * X).round(), min_q, max_q)
    X_dequant = X_quant / scale
    return X_quant.to(torch.int8), X_dequant

# Keep an independent copy of every original parameter tensor
weights_fp = []
for param in model.parameters():
    weights_fp.append(param.data.clone())

# Work on a deep copy so `model` itself is left unmodified
model_abs = deepcopy(model)

# Replace each parameter of the copy with its 4-bit quantize/dequantize
# round trip, and keep the dequantized tensors in a list as well
weights_abs = []
with torch.no_grad():
    for param in model_abs.parameters():
        # Quantization runs on a float32 view of the parameter
        dequantized = absmax_quantize_4bit(param.data.float())[1]
        # Cast back to the parameter's own dtype before writing in place
        param.data.copy_(dequantized.to(dtype=param.dtype))
        weights_abs.append(dequantized)


In [10]:
from transformers import AutoTokenizer

# 1. Load tokenizer
# `model_name` is not defined in this cell, so a cell that sets it must have
# run first. The helper functions in this cell read `tokenizer` as a global
# rather than taking it as an argument.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# 2. Text generation helper
@torch.no_grad()
def generate_text(model, prompt, max_new_tokens=50):
    """Generate a continuation of ``prompt`` with sampling disabled.

    Uses the module-level ``tokenizer`` for both encoding and decoding, and
    puts ``model`` into eval mode as a side effect.

    Args:
        model: Model exposing ``eval()``, ``device`` and ``generate()``.
        prompt: Text to tokenize and feed to ``model.generate``.
        max_new_tokens: Upper bound on generated tokens, forwarded as-is.

    Returns:
        The first returned sequence decoded to a string with special tokens
        skipped.
    """
    model.eval()
    encoded = tokenizer(prompt, return_tensors="pt").to(model.device)
    generation_kwargs = {
        "max_new_tokens": max_new_tokens,
        "do_sample": False,
        # The EOS token id doubles as the padding id
        "pad_token_id": tokenizer.eos_token_id,
    }
    sequences = model.generate(**encoded, **generation_kwargs)
    return tokenizer.decode(sequences[0], skip_special_tokens=True)

# 3. Perplexity helper
@torch.no_grad()
def compute_ppl(model, prompt):
    """Return ``exp(loss)`` of ``model`` on ``prompt`` as a Python float.

    The prompt is encoded with the module-level ``tokenizer`` and the same
    token ids are passed as both inputs and labels. ``model`` is put into
    eval mode as a side effect.

    Args:
        model: Model exposing ``eval()`` and ``device`` whose forward call
            accepts ``labels`` and returns an object with a ``loss``.
        prompt: Text to score.

    Returns:
        ``exp`` of the returned loss.
    """
    # NOTE(review): presumably the loss is a mean per-token cross-entropy,
    # which would make this a perplexity — confirm for the model in use.
    model.eval()
    token_ids = tokenizer(prompt, return_tensors="pt").to(model.device)["input_ids"]
    result = model(token_ids, labels=token_ids)
    return result.loss.exp().item()

# 4. Test prompt
prompt = "Tell me a fun fact about the moon."

# 5./6. Report generated text and PPL, first for `model` and then for the
# quantized copy `model_abs`
report_targets = (
    ("\n🧪 [Full-Precision Model Output]", model),
    ("\n🧪 [4-bit Quantized Model Output]", model_abs),
)
for heading, candidate in report_targets:
    print(heading)
    print(generate_text(candidate, prompt))
    print(f"PPL: {compute_ppl(candidate, prompt):.2f}")



🧪 [Full-Precision Model Output]




Tell me a fun fact about the moon. I’m going to guess you’re going to say it’s the only natural satellite in the solar system. I’m going to guess you’re going to say it’s the only natural satellite in the solar system. I’m going to guess you’re going
PPL: 50.70

🧪 [4-bit Quantized Model Output]
Tell me a fun fact about the moon..attach.attach.attach.attach.attach.attach.attach.attach.attach.attach.attach.attach.attach.attach.attach.attach.attach.attach.attach.attach.attach.attach.attach.attach.attach.attach.attach.attach.attach.attach.attach.attach.attach.attach.attach.attach.attach.attach.attach.attach.attach.attach.attach.attach.attach.attach.attach.attach.attach.attach
PPL: 846522.00


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

# 1. Model name
model_name = "your_model_name"

# 2. BitsAndBytes 4-bit quantization settings: "nf4" quant type, double
#    quantization enabled, bfloat16 compute dtype
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",  # or "fp4"
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Passing `device_map` requires the `accelerate` package; the recorded run of
# this cell stopped with a ValueError saying so.
load_kwargs = {
    "quantization_config": bnb_config,
    "device_map": "auto",
}
model = AutoModelForCausalLM.from_pretrained(model_name, **load_kwargs)


# 5. Text generation helper
@torch.no_grad()
def generate_text(model, prompt, max_new_tokens=50):
    """Generate a continuation of ``prompt`` with sampling disabled.

    Uses the module-level ``tokenizer`` for both encoding and decoding, and
    puts ``model`` into eval mode as a side effect.

    Args:
        model: Model exposing ``eval()``, ``device`` and ``generate()``.
        prompt: Text to tokenize and feed to ``model.generate``.
        max_new_tokens: Upper bound on generated tokens, forwarded as-is.

    Returns:
        The first returned sequence decoded to a string with special tokens
        skipped.
    """
    model.eval()
    encoded = tokenizer(prompt, return_tensors="pt").to(model.device)
    generation_kwargs = {
        "max_new_tokens": max_new_tokens,
        "do_sample": False,
        # The EOS token id doubles as the padding id
        "pad_token_id": tokenizer.eos_token_id,
    }
    sequences = model.generate(**encoded, **generation_kwargs)
    return tokenizer.decode(sequences[0], skip_special_tokens=True)

# 6. Perplexity helper
@torch.no_grad()
def compute_ppl(model, prompt):
    """Return ``exp(loss)`` of ``model`` on ``prompt`` as a Python float.

    The prompt is encoded with the module-level ``tokenizer`` and the same
    token ids are passed as both inputs and labels. ``model`` is put into
    eval mode as a side effect.

    Args:
        model: Model exposing ``eval()`` and ``device`` whose forward call
            accepts ``labels`` and returns an object with a ``loss``.
        prompt: Text to score.

    Returns:
        ``exp`` of the returned loss.
    """
    # NOTE(review): presumably the loss is a mean per-token cross-entropy,
    # which would make this a perplexity — confirm for the model in use.
    model.eval()
    token_ids = tokenizer(prompt, return_tensors="pt").to(model.device)["input_ids"]
    result = model(token_ids, labels=token_ids)
    return result.loss.exp().item()

# 7. Test prompt
prompt = "Tell me a fun fact about the moon."

# Print the heading first, then the generated text, then the PPL of the
# bitsandbytes-loaded model on the same prompt
print("\n🧪 [4-bit NF4 Quantized Model Output]")
generated_text = generate_text(model, prompt)
print(generated_text)
prompt_ppl = compute_ppl(model, prompt)
print(f"PPL: {prompt_ppl:.2f}")


ValueError: Using a `device_map` or `tp_plan` requires `accelerate`. You can install it with `pip install accelerate`