In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os

# Select which CUDA device(s) this process may see. The variable is set here,
# before `torch` is imported further down in the notebook.
# NOTE(review): the index "6" is specific to the author's machine — adjust it
# (or remove this line) when running elsewhere.
os.environ["CUDA_VISIBLE_DEVICES"] = "6"

In [3]:
import awq

In [4]:
import tqdm
import torch
from torch import nn
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
from functools import partial
import gc

In [5]:
def evaluate(model, tokenizer, nsamples=40, seqlen=2048):
    """Compute the perplexity of ``model`` on the WikiText-2 (raw) test split.

    The test texts are joined with blank lines, tokenized in a single pass and
    cut into consecutive, non-overlapping windows of ``seqlen`` tokens. Every
    window is scored with next-token cross-entropy.

    Args:
        model: Causal language model. Must expose ``.device`` and ``.eval()``
            and, when called on a batch of token ids, return an object with a
            ``.logits`` attribute.
        tokenizer: Callable that, given text and ``return_tensors="pt"``,
            returns an object with an ``input_ids`` tensor.
        nsamples: Maximum number of windows to score. Clamped to the number of
            complete windows the tokenized corpus actually contains.
        seqlen: Number of tokens per window.

    Returns:
        A 0-dim tensor holding the perplexity.

    Raises:
        ValueError: If ``nsamples`` < 1, ``seqlen`` < 2, or the tokenized
            corpus is shorter than a single window.
    """
    if nsamples < 1:
        raise ValueError(f"nsamples must be >= 1, got {nsamples}")
    if seqlen < 2:
        raise ValueError(f"seqlen must be >= 2, got {seqlen}")

    test_split = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
    encoded = tokenizer("\n\n".join(test_split["text"]), return_tensors="pt")
    input_ids = encoded.input_ids.to(model.device)

    # Only score complete windows: a truncated (or empty) trailing window would
    # otherwise be weighted as if it were full length.
    available = input_ids.shape[1] // seqlen
    if available == 0:
        raise ValueError(
            f"tokenized corpus has {input_ids.shape[1]} tokens, "
            f"fewer than one window of {seqlen}"
        )
    nsamples = min(nsamples, available)

    model = model.eval()
    loss_fct = nn.CrossEntropyLoss()

    nlls = []
    for i in tqdm.tqdm(range(nsamples), desc="evaluating..."):
        batch = input_ids[:, i * seqlen : (i + 1) * seqlen]
        with torch.no_grad():
            lm_logits = model(batch).logits

        # Position k predicts token k + 1, so drop the last logit and the
        # first label.
        shift_logits = lm_logits[:, :-1, :].contiguous().float()
        # `.to(...)` is a no-op when labels and logits already share a device.
        shift_labels = batch[:, 1:].to(shift_logits.device)

        loss = loss_fct(
            shift_logits.view(-1, shift_logits.size(-1)), shift_labels.reshape(-1)
        )
        # The mean loss of each window is weighted by `seqlen` and normalized
        # by `nsamples * seqlen` below, keeping the same convention as before
        # so results stay comparable.
        nlls.append(loss.float() * seqlen)

    return torch.exp(torch.stack(nlls).sum() / (nsamples * seqlen))

In [6]:
def get_model_size(model: nn.Module, data_width=16, group_size=-1):
    """Return the size of ``model``'s parameters, measured in bits.

    Only tensors yielded by ``model.parameters()`` are counted.

    Args:
        model: Module whose parameters are tallied.
        data_width: Number of bits used to store each element.
        group_size: Elements per quantization group. Any value other than -1
            adds ``(16 + 4) / group_size`` bits of overhead per element.

    Returns:
        Total number of elements multiplied by the effective bits per element.
    """
    bits_per_element = data_width
    if group_size != -1:
        # 16 + 4 extra bits per group — presumably a 16-bit scale plus a 4-bit
        # zero point (TODO confirm) — amortized over the group's elements.
        bits_per_element = bits_per_element + (16 + 4) / group_size

    total_elements = sum(p.numel() for p in model.parameters())
    return total_elements * bits_per_element


# Unit sizes expressed in bits. get_model_size() returns a bit count, so
# dividing its result by one of these converts it to that unit
# (e.g. ``size / MiB``).
Byte = 8  # bits in one byte
KiB = 1024 * Byte
MiB = 1024 * KiB
GiB = 1024 * MiB

In [7]:
# Alternative checkpoint, kept for reference:
# model_path = "facebook/opt-1.3b"

# Checkpoint identifier used for both the tokenizer and the model below.
model_path = "facebook/opt-13b"

# `tokenizer` and `model` are notebook-wide globals reused by later cells.
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto")



Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]



In [None]:
# Evaluate the unquantized baseline model.
model_perplexity = evaluate(model, tokenizer)
# group_size=-1: the baseline has no per-group quantization metadata, so no
# group overhead must be added on top of data_width bits per element.
model_size = get_model_size(model, data_width=32, group_size=-1)
print(f"\nmodel perplexity: {model_perplexity:.2f}")
print(f"model size: {model_size/MiB:.2f} MiB")

evaluating...:  32%|███▎      | 13/40 [01:25<02:59,  6.65s/it]

### Naive Quantization

In [None]:
# from awq.quantize.quantizer import (
#     real_quantize_model_weight,
#     pseudo_quantize_model_weight,
#     pseudo_quantize_tensor,
# )

# def quantize_opt_1(
#     model,
#     w_n_bits: int = 4,
#     a_n_bits: int = 4,
#     zero_point: bool = True,
#     group_size: int = 128,
# ):
#     from transformers.models.opt.modeling_opt import (
#         OPTAttention,
#         OPTDecoderLayer,
#     )

#     for name, m in model.model.named_modules():
#         if isinstance(m, OPTDecoderLayer):
#             m.fc1.weight.data = pseudo_quantize_tensor(
#                 m.fc1.weight.data,
#                 n_bit=w_n_bits,
#                 zero_point=zero_point,
#                 q_group_size=group_size,
#             )
#             m.fc2.weight.data = pseudo_quantize_tensor(
#                 m.fc2.weight.data,
#                 n_bit=w_n_bits,
#                 zero_point=zero_point,
#                 q_group_size=group_size,
#             )

#     return model

In [None]:
# q_config = {
#     "zero_point": True,  # by default True
#     "q_group_size": 128,  # whether to use group quantization
# }

# model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto")
# model = quantize_opt_1(model, w_n_bits=4, zero_point=True, group_size=128)
# model.cuda()
# print()

In [None]:
# # Evaluate the model
# model_perplexity = evaluate(model, tokenizer)
# model_size = get_model_size(model, data_width=32, group_size=128)
# print(f"\nmodel perplexity: {model_perplexity:.2f}")
# print(f"model size: {model_size/MiB:.2f} MiB")

In [None]:
# from awq.quantize.pre_quant import run_awq, apply_awq

# q_config = {
#     "zero_point": True,  # by default True
#     "q_group_size": 128,  # whether to use group quantization
# }

# awq_results = run_awq(
#     model,
#     tokenizer,
#     w_bit=4,
#     q_config=q_config,
#     n_samples=128,
#     seqlen=512,
# )

# dump_awq = "awq_results.pt"
# torch.save(awq_results, dump_awq)
# print("AWQ results saved at", dump_awq)

In [None]:
# dump_awq = "awq_results.pt"
# torch.save(awq_results, dump_awq)
# print("AWQ results saved at", dump_awq)

In [None]:
# from awq.quantize.pre_quant import run_awq, apply_awq


# load_awq = "awq_results.pt"
# awq_results = torch.load(load_awq, map_location="cpu")

# model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto")
# apply_awq(model, awq_results)

In [None]:
# from awq.quantize.quantizer import (
#     real_quantize_model_weight,
#     pseudo_quantize_model_weight,
# )

# q_config = {
#     "zero_point": True,  # by default True
#     "q_group_size": 128,  # whether to use group quantization
# }

# pseudo_quantize_model_weight(model, w_bit=4, q_config=q_config)
# model.cuda()
# print()

In [None]:
# Evaluate the model
# model_perplexity = evaluate(model, tokenizer)
# model_size = get_model_size(model, data_width=32, group_size=128)
# print(f"\nmodel perplexity: {model_perplexity:.2f}")
# print(f"model size: {model_size/MiB:.2f} MiB")

In [None]:
# from awq.quantize.quantizer import (
#     real_quantize_model_weight,
#     pseudo_quantize_model_weight,
# )

# q_config = {
#     "zero_point": True,  # by default True
#     "q_group_size": 128,  # whether to use group quantization
# }

# pseudo_quantize_model_weight(model, w_bit=4, q_config=q_config)

In [None]:
import torch.nn as nn
import torch.nn.functional as F
from awq.quantize.quantizer import pseudo_quantize_tensor
from typing import Literal


@torch.no_grad()
def quantize_activation_per_token_absmax(t, n_bits=8):
    """Fake-quantize ``t`` in place with one absmax scale per last-dim row.

    For every slice along the last dimension the scale is
    ``max(|row|) / (2 ** (n_bits - 1) - 1)``; values are divided by the scale,
    rounded to the nearest integer and multiplied back, so the result stays in
    the input dtype but only takes values on the quantization grid.

    Args:
        t: Tensor to quantize. It is modified in place. Any memory layout is
            accepted (contiguity is not required).
        n_bits: Bit width of the simulated signed integer format.

    Returns:
        The same tensor object ``t``, after in-place quantization.
    """
    q_max = 2 ** (n_bits - 1) - 1
    scales = t.abs().max(dim=-1, keepdim=True)[0]
    # Floor the absmax at 1e-5 so all-zero rows do not divide by zero.
    scales.clamp_(min=1e-5).div_(q_max)
    t.div_(scales).round_().mul_(scales)
    return t


@torch.no_grad()
def quantize_activation_per_tensor_absmax(t, n_bits=8):
    """Fake-quantize ``t`` in place with a single absmax scale for the tensor.

    The scale is ``max(|t|) / (2 ** (n_bits - 1) - 1)``; values are divided by
    the scale, rounded to the nearest integer and multiplied back, so the
    result stays in the input dtype but only takes values on the quantization
    grid.

    Args:
        t: Tensor to quantize. It is modified in place. Any shape (including
            0-dim) and any memory layout is accepted.
        n_bits: Bit width of the simulated signed integer format.

    Returns:
        The same tensor object ``t``, after in-place quantization.
    """
    q_max = 2 ** (n_bits - 1) - 1
    scales = t.abs().max()
    # Floor the absmax at 1e-5 so an all-zero tensor does not divide by zero.
    scales.clamp_(min=1e-5).div_(q_max)
    t.div_(scales).round_().mul_(scales)
    return t


class QuantizedLinear(nn.Module):
    """Linear layer with simulated ("fake") weight and activation quantization.

    The weight and bias are stored as registered buffers (not parameters), so
    they are moved and cast by ``nn.Module.to()`` / ``.cuda()`` like any other
    buffer but are not returned by ``.parameters()``. No custom ``to()`` is
    needed for that reason.

    Activations (and optionally outputs) are passed through one of the absmax
    fake-quantizers defined in this notebook before/after the matmul. Those
    quantizers operate in place, so the tensor handed to ``forward`` is
    modified when activation quantization is enabled.
    """

    def __init__(
        self,
        in_features: int,
        out_features: int,
        bias: bool = True,
        w_n_bits=4,
        a_n_bits=16,
        act_quant: Literal["per_token", "per_tensor", "none"] = "per_token",
        quantize_output: bool = False,
    ):
        """Create a layer with random fp16 weights and a zero bias.

        Args:
            in_features: Size of each input sample.
            out_features: Size of each output sample.
            bias: Whether to allocate a bias buffer; otherwise ``bias`` is None.
            w_n_bits: Accepted for symmetry with ``from_linear``; the
                constructor itself does not quantize the weight.
            a_n_bits: Bit width handed to the activation quantizer.
            act_quant: ``"per_token"``, ``"per_tensor"``; any other value
                disables activation quantization.
            quantize_output: If True, the layer output is quantized with the
                same quantizer as the input.
        """
        super().__init__()

        self.in_features = in_features
        self.out_features = out_features

        self.register_buffer(
            "weight",
            torch.randn(
                self.out_features,
                self.in_features,
                dtype=torch.float16,
                requires_grad=False,
            ),
        )

        if bias:
            self.register_buffer(
                "bias",
                torch.zeros(
                    (1, self.out_features), dtype=torch.float16, requires_grad=False
                ),
            )
        else:
            self.register_buffer("bias", None)

        if act_quant == "per_token":
            self.act_quant_name = "per_token"
            self.act_quant = partial(
                quantize_activation_per_token_absmax, n_bits=a_n_bits
            )
        elif act_quant == "per_tensor":
            self.act_quant_name = "per_tensor"
            self.act_quant = partial(
                quantize_activation_per_tensor_absmax, n_bits=a_n_bits
            )
        else:
            # Anything else (including "none") leaves activations untouched.
            self.act_quant_name = "None"
            self.act_quant = lambda x: x

        if quantize_output:
            self.output_quant_name = self.act_quant_name
            self.output_quant = self.act_quant
        else:
            self.output_quant_name = "None"
            self.output_quant = lambda x: x

    @torch.no_grad()
    def forward(self, x):
        """Quantize ``x``, apply the linear map, then quantize the output.

        The value returned by ``self.act_quant`` is what feeds the matmul, so
        the layer is correct regardless of whether the configured quantizer
        works in place or returns a new tensor.
        """
        q_x = self.act_quant(x)
        y = F.linear(q_x, self.weight, self.bias)
        return self.output_quant(y)

    @classmethod
    def from_linear(
        cls,
        linear: nn.Linear,
        w_n_bits: int = 4,
        a_n_bits: int = 4,
        zero_point: bool = True,
        group_size: int = 128,
        act_quant: Literal["per_token", "per_tensor", "none"] = "per_token",
    ):
        """Build a ``QuantizedLinear`` from an existing ``nn.Linear``.

        Args:
            linear: Source layer; its weight is fake-quantized and its bias
                tensor (if any) is reused as-is.
            w_n_bits: Bit width passed to ``pseudo_quantize_tensor``.
            a_n_bits: Bit width for activation quantization.
            zero_point: Forwarded to ``pseudo_quantize_tensor``.
            group_size: Forwarded to ``pseudo_quantize_tensor`` as
                ``q_group_size``.
            act_quant: Activation quantization mode (see ``__init__``).

        Returns:
            The new ``QuantizedLinear`` instance. Output quantization is left
            disabled.
        """
        awq_linear = cls(
            linear.in_features,
            linear.out_features,
            bias=linear.bias is not None,
            w_n_bits=w_n_bits,
            a_n_bits=a_n_bits,
            act_quant=act_quant,
        )

        # Replace the placeholder random weight with the quantized copy of the
        # source weight. NOTE(review): assumes pseudo_quantize_tensor returns a
        # new tensor rather than modifying `linear.weight` — confirm.
        awq_linear.weight.data = pseudo_quantize_tensor(
            w=linear.weight.data,
            n_bit=w_n_bits,
            zero_point=zero_point,
            q_group_size=group_size,
        )

        if linear.bias is not None:
            # The bias storage is shared with the source layer, not copied.
            awq_linear.bias.data = linear.bias.data

        return awq_linear

In [None]:
def quantize_opt(
    model,
    w_n_bits: int = 4,
    a_n_bits: int = 4,
    zero_point: bool = True,
    group_size: int = 128,
    act_quant: Literal["per_token", "per_tensor", "none"] = "per_token",
):
    """Replace the ``fc1``/``fc2`` layers of every OPT decoder layer in place.

    Each ``OPTDecoderLayer`` found under ``model.model`` gets its ``fc1`` and
    ``fc2`` sub-modules swapped for ``QuantizedLinear.from_linear(...)`` built
    from the existing layer. No other module is modified.

    Args:
        model: Model exposing the decoder stack under ``model.model``.
        w_n_bits: Weight bit width.
        a_n_bits: Activation bit width.
        zero_point: Forwarded to ``QuantizedLinear.from_linear``.
        group_size: Weight quantization group size.
        act_quant: Activation quantization mode.

    Returns:
        The same ``model`` object, modified in place.
    """
    from transformers.models.opt.modeling_opt import OPTDecoderLayer

    # Snapshot the decoder layers first so that swapping sub-modules below
    # cannot interfere with the traversal of the module tree.
    decoder_layers = [
        module
        for _name, module in model.model.named_modules()
        if isinstance(module, OPTDecoderLayer)
    ]

    for layer in decoder_layers:
        for fc_name in ("fc1", "fc2"):
            quantized = QuantizedLinear.from_linear(
                getattr(layer, fc_name),
                w_n_bits=w_n_bits,
                a_n_bits=a_n_bits,
                zero_point=zero_point,
                group_size=group_size,
                act_quant=act_quant,
            )
            setattr(layer, fc_name, quantized)

    return model

In [None]:
w_n_bits = 8
a_n_bits = 8

# Drop the reference to the previously loaded model and reclaim its memory
# before loading a fresh copy, so two full sets of weights are not resident at
# the same time.
model = None
gc.collect()
torch.cuda.empty_cache()

model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto")
# apply the AWQ results
# apply_awq(model, awq_results)

model = quantize_opt(
    model, w_n_bits=w_n_bits, a_n_bits=a_n_bits, act_quant="per_tensor"
)
model.cuda()
print()

In [None]:
# Evaluate the model
model_perplexity = evaluate(model, tokenizer)
# NOTE(review): get_model_size() sums model.parameters() only, while
# QuantizedLinear keeps its weight and bias as registered buffers, so the
# replaced fc1/fc2 layers appear to be left out of this total. Every remaining
# parameter is also counted at w_n_bits even though quantize_opt() only
# rewrites fc1/fc2. Treat the printed size as a rough estimate — TODO confirm
# the intended accounting.
model_size = get_model_size(model, data_width=w_n_bits, group_size=128)
print(f"\nmodel perplexity: {model_perplexity:.2f}")
print(f"model size: {model_size/MiB:.2f} MiB")