In [1]:
# 第八章作业
# 作业2： 使用 AWQ 量化 Facebook OPT-6.7B 模型
import os

# Point the Hugging Face cache root and the hub cache at a local directory.
# NOTE(review): this sits ahead of the `awq` / `transformers` imports,
# presumably so those libraries see the cache location when they are imported.
os.environ.update(
    HF_HOME='D:/MTIDE/.cache/huggingface',
    HF_HUB_CACHE='D:/MTIDE/.cache/huggingface/hub',
)

from awq import AutoAWQForCausalLM
from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig
import torch

# Local copy of the OPT-6.7B checkpoint, downloaded from Hugging Face ahead of time.
# (Hub id alternative: "facebook/opt-6.7b")
model_name_or_path = "D:/MTIDE/code/AI/models/facebook/opt-6.7b"

# Directory (relative to the working directory) that receives the quantized
# model and its tokenizer.
quant_model_dir = 'models/opt-6.7b-awq'

# Settings passed to `model.quantize(...)`: weight bit width, quantization
# group size, zero-point flag and the kernel version name.
quant_config = dict(
    zero_point=True,
    q_group_size=128,
    w_bit=4,
    version="GEMM",
)

# Load the source checkpoint through the AutoAWQ wrapper, plus its tokenizer.
# NOTE(review): trust_remote_code=True allows Python code shipped with a
# checkpoint to be executed; confirm it is actually required for this model.
model = AutoAWQForCausalLM.from_pretrained(
    model_name_or_path,
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(
    model_name_or_path,
    trust_remote_code=True,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [2]:
# Quantize the model using the settings in `quant_config`. The return value is
# not captured; later cells keep using the same `model` object.
# In the recorded run below, a cached copy of the `mit-han-lab/pile-val-backup`
# dataset was picked up (presumably as calibration data) and the 32 AWQ steps
# took a little over 11 minutes.
model.quantize(tokenizer, quant_config=quant_config)

Using the latest cached version of the dataset since mit-han-lab/pile-val-backup couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at D:\MTIDE\.cache\huggingface\datasets\mit-han-lab___pile-val-backup\default\0.0.0\2f5e46ae6a69cf0dce4b12f78241c408936ca0e4 (last modified on Tue Apr  9 09:46:00 2024).
AWQ: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32/32 [11:19<00:00, 21.25s/it]


In [3]:
# Transformers compatibility configuration
from transformers import AwqConfig, AutoConfig

# Mirror the AutoAWQ settings in a Transformers `AwqConfig` so the saved
# config works with the Transformers integration. The version string is
# lower-cased, and `.to_dict()` is used because a plain dictionary must be stored.
quantization_config = AwqConfig(
    version=quant_config["version"].lower(),
    zero_point=quant_config["zero_point"],
    group_size=quant_config["q_group_size"],
    bits=quant_config["w_bit"],
).to_dict()

# The underlying pretrained Transformers model is held in the wrapper's
# `model` attribute; attach the dictionary to that model's config.
model.model.config.quantization_config = quantization_config

In [4]:
# Save the quantized model weights into `quant_model_dir`.
model.save_quantized(quant_model_dir)
# Save the tokenizer into the same directory. As the cell's last expression,
# the returned tuple of written file paths is displayed as the cell output.
tokenizer.save_pretrained(quant_model_dir)

('models/opt-6.7b-awq\\tokenizer_config.json',
 'models/opt-6.7b-awq\\special_tokens_map.json',
 'models/opt-6.7b-awq\\vocab.json',
 'models/opt-6.7b-awq\\merges.txt',
 'models/opt-6.7b-awq\\added_tokens.json',
 'models/opt-6.7b-awq\\tokenizer.json')

In [5]:
# Call eval() on the quantized model. Its repr, displayed as the cell output,
# lists the attention projections as WQLinear_GEMM layers with w_bit=4 and
# group_size=128.
model.eval()

OptAWQForCausalLM(
  (model): OPTForCausalLM(
    (model): OPTModel(
      (decoder): OPTDecoder(
        (embed_tokens): Embedding(50272, 4096, padding_idx=1)
        (embed_positions): OPTLearnedPositionalEmbedding(2050, 4096)
        (final_layer_norm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
        (layers): ModuleList(
          (0-31): 32 x OPTDecoderLayer(
            (self_attn): OPTAttention(
              (k_proj): WQLinear_GEMM(in_features=4096, out_features=4096, bias=True, w_bit=4, group_size=128)
              (v_proj): WQLinear_GEMM(in_features=4096, out_features=4096, bias=True, w_bit=4, group_size=128)
              (q_proj): WQLinear_GEMM(in_features=4096, out_features=4096, bias=True, w_bit=4, group_size=128)
              (out_proj): WQLinear_GEMM(in_features=4096, out_features=4096, bias=True, w_bit=4, group_size=128)
            )
            (activation_fn): ReLU()
            (self_attn_layer_norm): LayerNorm((4096,), eps=1e-05, elementwise_affin

In [6]:
# Reload the saved quantized checkpoint on the GPU through plain Transformers.
# Note: this rebinds `tokenizer` and `model`, replacing the AutoAWQ objects
# created in the earlier cells.
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained(quant_model_dir)
# `device_map` already places the weights on the target device while loading,
# so no follow-up `.to(...)` call is needed. "cuda:0" pins the first GPU.
model = AutoModelForCausalLM.from_pretrained(quant_model_dir, device_map="cuda:0")

In [7]:
def generate_text(text, max_new_tokens=64):
    """Generate a continuation of ``text`` with the notebook-level ``model``.

    Uses the module-level ``tokenizer`` and ``model`` objects.

    Args:
        text: Prompt string to continue.
        max_new_tokens: Upper bound on the number of tokens generated after
            the prompt. Defaults to 64.

    Returns:
        The decoded first sequence returned by ``model.generate`` (prompt plus
        continuation), with special tokens skipped.
    """
    # Move the encoded prompt to whichever device the model lives on rather
    # than assuming GPU index 0.
    inputs = tokenizer(text, return_tensors="pt").to(model.device)

    out = model.generate(**inputs, max_new_tokens=max_new_tokens)
    # Only one prompt is passed in, so decode the first returned sequence.
    return tokenizer.decode(out[0], skip_special_tokens=True)

In [8]:
# Sanity-check the reloaded quantized model on a short greeting prompt and
# print the decoded text (prompt included).
result = generate_text("Merry Christmas! I'm glad to")
print(result)

Merry Christmas! I'm glad to hear you're enjoying it. :D
Thank you and thank you again for the card and candy ;)


In [9]:
# Second sanity check with a different prompt; `result` is overwritten with
# the new decoded text before printing.
result = generate_text("The woman worked as a")
print(result)

The woman worked as a hairdresser?
This is what I saw as well. It seems strange but I don't think she deserves this.
