In [None]:
from awq.quantize.quantizer import AwqQuantizer
from awq import AutoAWQForCausalLM
from awq.evaluation import *
from transformers import AutoTokenizer
import os
import torch
from transformers.models.llama.modeling_llama import *
from awq.modules.linear import (
    WQLinear_GEMM,
    WQLinear_GEMV,
    WQLinear_Marlin,
    WQLinear_GEMVFast,
)
from model2bin import *

# Quantization group size. Used both for the AWQ config below and as the
# second argument of every LlamaModelFromAWQ(...) call in the export cells,
# so it is defined once here to keep the two in agreement.
QUANT_GROUP = 128
# Passed as the last argument to LlamaModelFromAWQ; its meaning is defined in
# model2bin (not visible from this notebook).
SCALE_UP = 4

# Full-precision checkpoint to quantize, and the directory the AWQ result is
# saved to / reloaded from.
model_path = '../meta-llama/Llama-2-7b-chat-hf'
quant_path = '../meta-llama/Llama-2-7b-chat-hf-awq-fpga'

# AWQ settings handed to model.quantize(). "q_group_size" references
# QUANT_GROUP instead of repeating the literal so the exporter and the
# quantizer cannot drift apart.
quant_config = {
    "zero_point": True,
    "q_group_size": QUANT_GROUP,
    "w_bit": 4,
    "version": "GEMV",
    "modules_to_not_convert": ["mlp.down_proj"],
}

In [None]:
# Quantize the full-precision checkpoint with AWQ and save the result.

# Load the model onto cuda:0, plus the tokenizer from the same checkpoint.
model = AutoAWQForCausalLM.from_pretrained(
    model_path,
    device_map="cuda:0",
)
tokenizer = AutoTokenizer.from_pretrained(
    model_path,
    trust_remote_code=True,
)

# Quantize the loaded model using the settings in quant_config.
model.quantize(tokenizer, quant_config=quant_config)

# Persist the quantized model first, then the tokenizer, into quant_path.
for save_artifact in (model.save_quantized, tokenizer.save_pretrained):
    save_artifact(quant_path)

In [None]:
# Reload the AWQ checkpoint saved in quant_path instead of re-quantizing.
#
# NOTE(review): GPU index 4 is hard-coded both in max_memory and in the
# .to("cuda:4") call, while the quantization cell uses cuda:0 — confirm this
# matches the target machine and keep the two occurrences in sync.
# The tokenizer is read from model_path (the original checkpoint), not from
# quant_path.

load_options = {
    "fuse_layers": False,
    "max_memory": {4: "8GB"},
}
model = AutoAWQForCausalLM.from_quantized(quant_path, **load_options)
tokenizer = AutoTokenizer.from_pretrained(
    model_path,
    trust_remote_code=True,
)

# Move the wrapped model onto the chosen GPU and re-attach it to the wrapper.
model.model = model.model.to("cuda:4")

In [None]:
# Export for the KV260. The PS DDR address space is split into two parts, so
# the model is emitted as two binaries: a first half and a second half.
# NOTE(review): the meaning of the (1024, 0) arguments is defined in
# model2bin and is not visible here.

quant_model = LlamaModelFromAWQ(
    model.model,
    QUANT_GROUP,
    1,
    [4, 1],
    SCALE_UP,
)

half_args = (1024, 0)
first_pack = quant_model.gen_model_bin_first_half(*half_args)
print(len(first_pack))
second_pack = quant_model.gen_model_bin_second_half(*half_args)
print(len(second_pack))

# Write each half to its own file, in the same order they were generated.
for bin_name, payload in (("llama0.bin", first_pack), ("llama1.bin", second_pack)):
    with open(bin_name, "wb") as bin_file:
        bin_file.write(payload)

In [None]:
# Export for the ZCU104 using both the PS and the PL DDR: one binary each.
# NOTE(review): the second argument of gen_model_bin (0 / 1) presumably
# selects the target memory — confirm against model2bin.

quant_model = LlamaModelFromAWQ(
    model.model,
    QUANT_GROUP,
    2,
    [4, 1],
    SCALE_UP,
)

model_pack_ps = quant_model.gen_model_bin(1024, 0)
print(len(model_pack_ps))
model_pack_pl = quant_model.gen_model_bin(1024, 1)
print(len(model_pack_pl))

# PS image first, then PL image.
for bin_name, payload in (("llmps.bin", model_pack_ps), ("llmpl.bin", model_pack_pl)):
    with open(bin_name, "wb") as bin_file:
        bin_file.write(payload)

In [None]:
# Export for the ZCU104 using the PL DDR only: a single binary is produced.
# NOTE(review): the meaning of the (1024, 0) arguments is defined in
# model2bin and is not visible here.

quant_model = LlamaModelFromAWQ(model.model, QUANT_GROUP, 1, [1], SCALE_UP)

model_pack = quant_model.gen_model_bin(1024, 0)
print(len(model_pack))

# Write the pack generated in THIS cell. The previous version wrote
# `model_pack_ps`, a variable belonging to the PS+PL cell, which either stored
# the wrong image in migpl.bin or raised NameError on a fresh kernel.
with open("migpl.bin", "wb") as f:
    f.write(model_pack)

In [None]:
# Export for boards with two PL MIG DDRs: one binary per DDR.
# NOTE(review): the second argument of gen_model_bin (0 / 1) presumably
# selects the target DDR — confirm against model2bin.

quant_model = LlamaModelFromAWQ(
    model.model,
    QUANT_GROUP,
    2,
    [1, 1],
    SCALE_UP,
)

pack_0 = quant_model.gen_model_bin(1024, 0)
print(len(pack_0))
pack_1 = quant_model.gen_model_bin(1024, 1)
print(len(pack_1))

# Write the images in generation order.
for bin_name, payload in (("llmc0.bin", pack_0), ("llmc1.bin", pack_1)):
    with open(bin_name, "wb") as bin_file:
        bin_file.write(payload)

In [None]:
# Export for boards with four PL MIG DDRs: one binary per DDR.
# NOTE(review): the second argument of gen_model_bin (0..3) presumably
# selects the target DDR — confirm against model2bin.

quant_model = LlamaModelFromAWQ(
    model.model,
    QUANT_GROUP,
    4,
    [1, 1, 1, 1],
    SCALE_UP,
)

# Generate all four images first (printing each size as it is produced),
# exactly as before, and only then write them out.
ddr_packs = []
for ddr_index in range(4):
    ddr_pack = quant_model.gen_model_bin(1024, ddr_index)
    print(len(ddr_pack))
    ddr_packs.append(ddr_pack)

# Keep the individual names available in the notebook namespace.
pack_0, pack_1, pack_2, pack_3 = ddr_packs

bin_names = ("llmc0.bin", "llmc1.bin", "llmc2.bin", "llmc3.bin")
for bin_name, payload in zip(bin_names, ddr_packs):
    with open(bin_name, "wb") as bin_file:
        bin_file.write(payload)