In [1]:
# Reload imported modules automatically before each cell is executed, so edits
# to source files take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os

# Expose only the CUDA device with index 7 to this process. This cell runs
# before the later cell that imports torch.
os.environ.update(CUDA_VISIBLE_DEVICES="7")

In [3]:
import awq

In [4]:
import tqdm
import torch
from torch import nn
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
from functools import partial
import gc

In [5]:
def evaluate(model, tokenizer, nsamples: int = 40, seqlen: int = 2048):
    """Compute perplexity on the WikiText-2 (raw) test split.

    The test split is joined with blank lines, tokenized once, and cut into
    consecutive, non-overlapping windows of ``seqlen`` tokens. Each window is
    scored with next-token cross-entropy and the per-window losses are
    averaged before exponentiation.

    Args:
        model: Causal language model. It must expose ``.device`` and
            ``.eval()``, and calling it on a batch of token ids must return
            an object with a ``.logits`` attribute.
        tokenizer: Callable that, given a string and ``return_tensors="pt"``,
            returns an object with an ``input_ids`` tensor.
        nsamples: Maximum number of windows to score. It is clamped to the
            number of full windows available in the tokenized text.
        seqlen: Number of tokens per window (default 2048, the value that was
            previously hard-coded).

    Returns:
        A 0-dim tensor holding the perplexity.

    Raises:
        ValueError: If ``seqlen`` is smaller than 2, or if no full window can
            be scored (``nsamples`` <= 0 or the text is shorter than
            ``seqlen`` tokens).
    """
    if seqlen < 2:
        raise ValueError(f"seqlen must be at least 2, got {seqlen}")

    testenc = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
    testenc = tokenizer("\n\n".join(testenc["text"]), return_tensors="pt")

    input_ids = testenc.input_ids.to(model.device)
    model = model.eval()

    # Score full windows only. A short or empty trailing window would still
    # be weighted as `seqlen` tokens below and skew (or NaN) the result.
    available = input_ids.shape[1] // seqlen
    nsamples = min(nsamples, available)
    if nsamples <= 0:
        raise ValueError(
            f"no full window of {seqlen} tokens to evaluate "
            f"({input_ids.shape[1]} tokens available)"
        )

    # Stateless, so one instance is shared by every window.
    loss_fct = nn.CrossEntropyLoss()

    nlls = []
    for i in tqdm.tqdm(range(nsamples), desc="evaluating..."):
        batch = input_ids[:, i * seqlen : (i + 1) * seqlen]
        with torch.no_grad():
            lm_logits = model(batch).logits

        # Position t predicts token t + 1: drop the last logit and the first label.
        shift_logits = lm_logits[:, :-1, :].contiguous().float()
        shift_labels = batch[:, 1:].to(shift_logits.device)

        loss = loss_fct(
            shift_logits.view(-1, shift_logits.size(-1)), shift_labels.reshape(-1)
        )
        # The mean loss is scaled by the window length here and divided by the
        # total length below, so the result is exp(mean of per-window losses).
        nlls.append(loss.float() * seqlen)

    return torch.exp(torch.stack(nlls).sum() / (nsamples * seqlen))

In [6]:
def get_model_size(model: nn.Module, data_width=16, group_size=-1):
    """Estimate the storage footprint of ``model`` in bits.

    Every parameter element is charged ``data_width`` bits. When grouping is
    enabled, ``(16 + 4) / group_size`` extra bits per element are added
    (presumably a 16-bit scale plus a 4-bit zero point per group; TODO confirm).

    Only tensors yielded by ``model.parameters()`` are counted; buffers and
    plain tensor attributes are not.

    Args:
        model: Module whose parameters are counted.
        data_width: Bits charged per parameter element.
        group_size: Elements per quantization group, or -1 for no grouping.

    Returns:
        The estimated size in bits (an int without grouping, a float with it).

    Raises:
        ValueError: If ``group_size`` is neither -1 nor a positive number.
    """
    if group_size != -1:
        # Zero would divide by zero, and other negative values would silently
        # subtract bits from the estimate.
        if group_size <= 0:
            raise ValueError(
                f"group_size must be -1 (no grouping) or positive, got {group_size}"
            )
        data_width += (16 + 4) / group_size

    num_elements = sum(param.numel() for param in model.parameters())
    return num_elements * data_width


# Unit sizes expressed in bits: dividing a bit count by one of these
# converts it to that unit.
Byte = 8
KiB = Byte * 2**10
MiB = Byte * 2**20
GiB = Byte * 2**30

In [7]:
# Checkpoint used by the cells below. Other OPT sizes are left commented out
# as alternatives.
# model_path = "facebook/opt-1.3b"

# model_path = "facebook/opt-2.7b"

model_path = "facebook/opt-6.7b"

# Load the tokenizer for the selected checkpoint (use_fast=False requests the
# non-"fast" implementation).
# NOTE(review): a later cell reassigns model_path to "facebook/opt-2.7b"
# without reloading this tokenizer — confirm that is intended.
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)



In [8]:
# Bit width of the unquantized checkpoint; it selects the dtype used to load
# the weights (float16 for 16, float32 otherwise).
original_model_n_bits = 16
torch_dtype = torch.float16 if original_model_n_bits == 16 else torch.float32

# Load the baseline model in that dtype. device_map="auto" leaves device
# placement to the library instead of fixing it here.
model = AutoModelForCausalLM.from_pretrained(
    model_path, torch_dtype=torch_dtype, device_map="auto"
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



In [9]:

# # Evaluate the model
# model_perplexity = evaluate(model, tokenizer)
# model_size = get_model_size(model, data_width=original_model_n_bits, group_size=128)

# ### Print the results
# print(f"\nmodel perplexity: {model_perplexity:.2f}")
# print(f"model size: {model_size/MiB:.2f} MiB")

In [14]:
from awq.quantize.pre_quant_new import run_awq, apply_awq
from typing import Literal

# Names of the activation-quantization modes. Not referenced again in the
# cells shown here.
ActQuantType = Literal["per_token", "per_tensor", "none", "per_channel"]

# Quantization settings read by the run_awq and quantize_opt_model cells below.
q_config = {
    "zero_point": True,  # by default True
    "q_group_size": 128,  # quantization group size (presumably weights per group; TODO confirm)
    "w_n_bits": 4,  # passed as the weight bit width (w_bit / w_n_bits) below
    "a_n_bits": 4,  # passed as a_n_bits below (presumably activation bit width; TODO confirm)
    # NOTE(review): the quantize_opt_model call in a later cell hard-codes
    # act_quant="per_token" and does not read this entry.
    "act_quant": "none",
}


In [None]:
# Load a fresh copy of the checkpoint and rebind `model` to it.
# NOTE(review): if a `model` from an earlier cell is still alive, both copies
# exist until this assignment completes — watch GPU memory.
model = AutoModelForCausalLM.from_pretrained(
    model_path, torch_dtype=torch_dtype, device_map="auto"
)

# Run the project's AWQ routine on the model and keep its return value for
# the save / apply cells further down.
# NOTE(review): n_samples and seqlen look like the calibration sample count
# and sequence length — confirm against run_awq.
awq_results = run_awq(
    model,
    tokenizer,
    w_bit=q_config["w_n_bits"],
    q_config=q_config,
    n_samples=128,
    seqlen=512,
)


In [None]:
# Display the current quantization settings (last expression of the cell).
q_config

In [13]:
# dump_awq = "awq_results.pt"
# torch.save(awq_results, dump_awq)
# print("AWQ results saved at", dump_awq)

In [14]:
# ### load awq
# load_awq = "awq_results.pt"
# awq_results = torch.load(load_awq, map_location="cpu")

In [24]:
from transformers.models.opt.modeling_opt import OPTDecoderLayer
from awq.quantize.fake_quant_new import quantize_opt_model

# NOTE(review): model_path is switched to the 2.7B checkpoint here, while the
# tokenizer (and any awq_results computed above) were produced with the
# earlier model_path — confirm the mismatch is intended.
model_path = "facebook/opt-2.7b"

model = AutoModelForCausalLM.from_pretrained(
    model_path, torch_dtype=torch_dtype, device_map="auto"
)

# apply the AWQ results
# NOTE(review): the call is commented out, so the model is quantized below
# without the AWQ results being applied.
# apply_awq(model, awq_results)


# Quantize the model with the bit widths and group size from q_config. The
# module printout recorded after the next cell shows fc1/fc2 as
# QuantizedLinear.
model = quantize_opt_model(
    model,
    w_n_bits=q_config["w_n_bits"],
    a_n_bits=q_config["a_n_bits"],
    # act_quant=q_config["act_quant"],
    # Hard-coded override: q_config["act_quant"] is "none" but is not used here.
    act_quant="per_token",
    group_size=q_config["q_group_size"],
)




In [25]:
# Release unused cached CUDA memory, then move the model to the GPU.
torch.cuda.empty_cache()
# Last expression of the cell, so the returned module's repr is displayed.
model.cuda()

OPTForCausalLM(
  (model): OPTModel(
    (decoder): OPTDecoder(
      (embed_tokens): Embedding(50272, 2560, padding_idx=1)
      (embed_positions): OPTLearnedPositionalEmbedding(2050, 2560)
      (final_layer_norm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
      (layers): ModuleList(
        (0-31): 32 x OPTDecoderLayer(
          (self_attn): OPTAttention(
            (k_proj): Linear(in_features=2560, out_features=2560, bias=True)
            (v_proj): Linear(in_features=2560, out_features=2560, bias=True)
            (q_proj): Linear(in_features=2560, out_features=2560, bias=True)
            (out_proj): Linear(in_features=2560, out_features=2560, bias=True)
          )
          (activation_fn): ReLU()
          (self_attn_layer_norm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
          (fc1): QuantizedLinear()
          (fc2): QuantizedLinear()
          (final_layer_norm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        )
      )
    )
 

In [26]:
# Evaluate the model: perplexity over 100 evaluation windows, plus a size
# estimate at the configured weight bit width and group size.
# NOTE(review): the recorded perplexity below (18856.74) looks far too high
# for a usable model — check the quantization settings (4-bit "per_token"
# activations, apply_awq disabled) before trusting this run.
model_perplexity = evaluate(model, tokenizer, nsamples=100)
# NOTE(review): get_model_size only counts model.parameters(). Confirm that
# QuantizedLinear registers its weights as parameters; otherwise the size
# printed here is under-reported.
model_size = get_model_size(
    model, data_width=q_config["w_n_bits"], group_size=q_config["q_group_size"]
)
print(f"\nmodel perplexity: {model_perplexity:.2f}")
# get_model_size returns bits and MiB is defined in bits, so this prints mebibytes.
print(f"model size: {model_size/MiB:.2f} MiB")


evaluating...: 100%|██████████| 100/100 [00:31<00:00,  3.22it/s]


model perplexity: 18856.74
model size: 482.32 MiB





## Scratch: ad-hoc inspection of a single decoder block

In [17]:
from awq.quantize.pre_quant_new import get_blocks


In [18]:
# Take the first block returned by get_blocks for inspection in the cells below.
# next() raises StopIteration when there are no blocks, whereas a for/break
# loop would silently leave `layer` unset (or stale from an earlier run).
layer = next(iter(get_blocks(model)))

In [19]:
from copy import deepcopy

# Deep copy of the block, for the in-place edit in the next cell.
x = deepcopy(layer)

In [None]:
# Overwrite every fc1 weight of the copy in place.
x.fc1.weight.data[:] = 1.0
# x.fc1.weight.data

# Display the fc1 weights of the original block (last expression of the
# cell), presumably to check that editing the copy left them unchanged.
layer.fc1.weight.data

In [None]:
# Check whether the block taken from get_blocks is an OPTDecoderLayer.
isinstance(layer, OPTDecoderLayer)