In [1]:
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

# Source checkpoint to quantize and the directory the quantized artifacts
# are saved into by the later cells.
model_name_or_path = "./model/CodeLlama-13b-Instruct-hf"
quant_model_dir = "models/lama-13b-awq"

# Settings handed to AutoAWQ's quantize(): 4-bit weights, quantization groups
# of 128, zero-point enabled, "GEMM" version.
quant_config = dict(
    zero_point=True,
    q_group_size=128,
    w_bit=4,
    version="GEMM",
)

# Load the unquantized checkpoint through AutoAWQ, plus the matching tokenizer.
model = AutoAWQForCausalLM.from_pretrained(
    model_name_or_path,
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(
    model_name_or_path,
    trust_remote_code=True,
)

# Quantize the loaded model with the settings in quant_config. This is the
# slow step: the recorded run took about 20 minutes for the 40 layers.
model.quantize(tokenizer, quant_config=quant_config)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Generating validation split: 0 examples [00:00, ? examples/s]

AWQ: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 40/40 [19:53<00:00, 29.84s/it]


In [2]:
# Echo the AutoAWQ settings dict as this cell's output.
quant_config

{'zero_point': True, 'q_group_size': 128, 'w_bit': 4, 'version': 'GEMM'}

In [3]:
from transformers import AwqConfig, AutoConfig

# Re-express the AutoAWQ settings under the argument names used by the
# transformers AwqConfig so the saved config is compatible with the
# transformers integration. The version string is lower-cased.
quantization_config = AwqConfig(
    version=quant_config["version"].lower(),
    zero_point=quant_config["zero_point"],
    group_size=quant_config["q_group_size"],
    bits=quant_config["w_bit"],
).to_dict()

# The underlying transformers model lives on the wrapper's `model` attribute;
# its config must receive a plain dict rather than the AwqConfig object.
model.model.config.quantization_config = quantization_config

In [4]:
# Save the quantized model weights into quant_model_dir.
model.save_quantized(quant_model_dir)
# Save the tokenizer into the same directory; the returned tuple of written
# file paths is displayed as this cell's output.
tokenizer.save_pretrained(quant_model_dir)  

('models/lama-13b-awq/tokenizer_config.json',
 'models/lama-13b-awq/special_tokens_map.json',
 'models/lama-13b-awq/tokenizer.model',
 'models/lama-13b-awq/added_tokens.json',
 'models/lama-13b-awq/tokenizer.json')

In [5]:
# Put the quantized model into evaluation mode. The returned module's repr is
# this cell's output and shows the projection layers as WQLinear_GEMM modules.
model.eval()

LlamaAWQForCausalLM(
  (model): LlamaForCausalLM(
    (model): LlamaModel(
      (embed_tokens): Embedding(32016, 5120)
      (layers): ModuleList(
        (0-39): 40 x LlamaDecoderLayer(
          (self_attn): LlamaSdpaAttention(
            (q_proj): WQLinear_GEMM(in_features=5120, out_features=5120, bias=False, w_bit=4, group_size=128)
            (k_proj): WQLinear_GEMM(in_features=5120, out_features=5120, bias=False, w_bit=4, group_size=128)
            (v_proj): WQLinear_GEMM(in_features=5120, out_features=5120, bias=False, w_bit=4, group_size=128)
            (o_proj): WQLinear_GEMM(in_features=5120, out_features=5120, bias=False, w_bit=4, group_size=128)
            (rotary_emb): LlamaRotaryEmbedding()
          )
          (mlp): LlamaMLP(
            (gate_proj): WQLinear_GEMM(in_features=5120, out_features=13824, bias=False, w_bit=4, group_size=128)
            (up_proj): WQLinear_GEMM(in_features=5120, out_features=13824, bias=False, w_bit=4, group_size=128)
            (do

In [6]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Reload the saved quantized checkpoint through the plain transformers API to
# confirm that it loads without AutoAWQ's wrapper class.
tokenizer = AutoTokenizer.from_pretrained(quant_model_dir)
# device_map places the weights on GPU 0 while loading, so no separate
# .to(...) move is needed afterwards.
model = AutoModelForCausalLM.from_pretrained(quant_model_dir, device_map="cuda:0")
def generate_text(text, max_new_tokens=64):
    """Generate a continuation of ``text`` with the module-level model.

    Args:
        text: Prompt string to continue.
        max_new_tokens: Maximum number of tokens to generate after the
            prompt. Defaults to 64.

    Returns:
        The first generated sequence decoded to a string (prompt followed by
        the continuation), with special tokens stripped.
    """
    # Move the encoded prompt to the device the model is on rather than
    # assuming GPU index 0.
    inputs = tokenizer(text, return_tensors="pt").to(model.device)

    out = model.generate(**inputs, max_new_tokens=max_new_tokens)
    return tokenizer.decode(out[0], skip_special_tokens=True)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [7]:
result = generate_text("Merry Christmas! I'm glad to")
print(result)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Merry Christmas! I'm glad to see that you are using "Chistmas". We are trying to make the word "Christmas" less offensive by using "Christmas" instead of "Merry Christmas". To be fair, we are only trying to change Christmas for a little while.
Now you may be wondering what does this "Mer


In [8]:
result = generate_text("The woman worked as a")
print(result)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


The woman worked as a volunteer for the hospital, assisting nurses at night, and in the hospital's emergency room on weekends.
The nurse told police she's been working at the hospital for 30 years – and she said 'it was no problem,' when a woman volunteered to help her
