# AWQ on Qwen2 (2B)
In this notebook, we use a ~2B Qwen2 language model to demonstrate the performance of AWQ on large language models. We implement AWQ real-INT4 inference kernels, which are wrapped as PyTorch modules and can be easily used by existing models. We also provide a simple example to show how to use AWQ to quantize a model and save/load the quantized model checkpoint.

To run this notebook, install the following packages (the code cells also import `numpy` and `attributedict`):

- [AWQ](https://github.com/mit-han-lab/llm-awq)
- [PyTorch](https://pytorch.org/)
- [Transformers](https://github.com/huggingface/transformers)
- [Accelerate](https://github.com/huggingface/accelerate)


In [None]:
import torch
import numpy as np

import gc

from transformers import AutoConfig, AutoTokenizer
from accelerate import load_checkpoint_and_dispatch

from awq.quantize.pre_quant import run_awq, apply_awq
from awq.quantize.quantizer import real_quantize_model_weight

from tinychat.utils.load_quant import load_awq_model, load_awq_llama_fast
from tinychat.utils.tune import device_warmup, tune_all_wqlinears
from tinychat.utils.prompt_templates import get_prompter, get_stop_token_ids
from tinychat.stream_generators import StreamGenerator
from tinychat.models.qwen2 import Qwen2ForCausalLM
from tinychat.modules import make_quant_norm, make_quant_attn, make_fused_mlp

import os

# Expose only GPU 0 to this process.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
# This demo only supports a single GPU for now

# Point this to your local or HF-downloaded ~2B Qwen2 checkpoint
model_path = "Qwen/Qwen2-VL-2B"  # Please change here
# Paths for AWQ search results and quantized weights specific to this 2B model
# awq_path is read by the quantization cell; quant_path is written there and
# read back when the quantized model is loaded.
# NOTE(review): awq_path is relative to the working directory while quant_path
# is an absolute, machine-specific path — adjust both for your environment.
awq_path = 'awq_cache/qwen2-2b-w4-g128.pt'
quant_path = '/home/ubuntu/cs259_project/llm-awq/awq_cache/qwen2-vl-2b-instruct-w4-g128.pt'

# Device string passed to the model-loading and generation calls below.
device = 'cuda'


In [None]:
from transformers import AutoProcessor, AutoModelForVision2Seq

# Load the checkpoint at model_path in FP16 through the vision-to-sequence
# auto class and move it to the GPU.
model = AutoModelForVision2Seq.from_pretrained(
    model_path,
    low_cpu_mem_usage=True,
    torch_dtype=torch.float16,
    use_cache=True,
).cuda()
# Keep only the language_model submodule; the wrapping model is no longer
# referenced by name after this line.
# NOTE(review): `model` is rebound by the quantization cell further down, so
# this object is only used if that cell is skipped — confirm intent.
model = model.language_model

Download a ~2B Qwen2 model (e.g., a 2B-scale Qwen2/Qwen2.5 checkpoint) from Hugging Face, then run the following cell to generate a quantized model checkpoint. The cell expects the pre-computed AWQ search results to already exist at `awq_path`. We only quantize the language decoder, which accounts for most of the model's parameters and dominates **the generation speed**.

Skip this part if the quantized checkpoints are already prepared.


In [5]:
from transformers.models.qwen2 import Qwen2ForCausalLM as Qwen2ForCausalLMFP16

# Load the FP16 checkpoint with the Hugging Face Qwen2 causal-LM class and
# move it to the GPU.
# NOTE(review): the output recorded for this cell ends in `KeyError: 'mrope'`
# after a warning that a `qwen2_vl` checkpoint is being instantiated as
# `qwen2`; presumably model_path must point to a checkpoint this class can
# load — confirm before relying on this cell.
model = Qwen2ForCausalLMFP16.from_pretrained(
    model_path,
    low_cpu_mem_usage=True,
    torch_dtype=torch.float16,
    use_cache=True,
).cuda()

# Load pre-computed AWQ search results (run AWQ search separately if needed)
awq_results = torch.load(
    awq_path,  # search-result file; tensors are mapped to the CPU on load
    map_location='cpu',
)

# Apply AWQ and generate real quantized weights
# The search results are applied first, then the weights are quantized with
# w_bit=4 and q_group_size=128; the loading cell later passes 4 and 128 as
# well, presumably because the two must match — confirm.
apply_awq(model, awq_results)
real_quantize_model_weight(
    model,
    w_bit=4,
    q_config={'zero_point': True, 'q_group_size': 128},
)

# Move the model to the CPU and write its state dict to quant_path.
torch.save(model.cpu().state_dict(), quant_path)

# Clean up
# Drop the model reference, run the garbage collector, and release cached
# CUDA memory before the quantized model is built.
del model
gc.collect()
torch.cuda.empty_cache()


You are using a model of type qwen2_vl to instantiate a model of type qwen2. This is not supported for all configurations of models and can yield errors.
Missing validation function mapping in `ROPE_VALIDATION_FUNCTIONS` for 'rope_type'='mrope'
Fetching 2 files: 100%|██████████| 2/2 [00:10<00:00,  5.32s/it]


KeyError: 'mrope'

We then load the quantized Qwen2 2B model. We first initialize an empty model and replace all the linear layers with WQLinear layers. After that, we load the quantized weights from the checkpoint.


In [None]:
def skip(*args, **kwargs):
    """Accept any positional or keyword arguments, do nothing, return ``None``."""
    return None

# Accelerate model initialization
# Replace the parameter initializers with no-ops so that constructing the
# model does not spend time on random initialization; the weights come from
# the quantized checkpoint loaded below.
# NOTE: these assignments patch torch process-wide and are not undone in this
# cell.
setattr(torch.nn.Linear, 'reset_parameters', lambda self: None)
setattr(torch.nn.LayerNorm, 'reset_parameters', lambda self: None)

torch.nn.init.kaiming_uniform_ = skip
torch.nn.init.kaiming_normal_ = skip
torch.nn.init.uniform_ = skip
torch.nn.init.normal_ = skip

# Tokenizer and config
# Both are read from model_path, the same location the FP16 model came from.
tokenizer = AutoTokenizer.from_pretrained(
    model_path,
    use_fast=False,
    trust_remote_code=True,
)
config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)

# Initialize TinyChat Qwen2 wrapper
# Built from the config only and cast to half precision.
model = Qwen2ForCausalLM(config).half()

# Load quantized model
# The arguments 4 and 128 presumably are the weight bit-width and group size
# used when quant_path was written — TODO confirm against the helper.
# NOTE(review): the helper is named for Llama and `load_awq_model` is also
# imported; verify this is the intended loader for the Qwen2 wrapper.
model = load_awq_llama_fast(model, quant_path, 4, 128, device)

# Optimize for inference speed
# These helpers are called for their effect on `model`; their return values
# are not used.
make_quant_attn(model, device)
make_quant_norm(model)
make_fused_mlp(model)

model = model.to(device)


Now, let's define the configurations for the conversation.


In [None]:
from attributedict.collections import AttributeDict

# Generation settings for the conversation. The dict literal is expanded into
# the same ordered list of (name, value) pairs before being wrapped in an
# AttributeDict.
gen_params = AttributeDict(
    list(
        {
            'seed': -1,  # RNG seed
            'n_threads': 1,  # TODO: fix this
            'n_predict': 512,  # new tokens to predict
            'n_parts': -1,  # amount of model parts (-1: determine from model dimensions)
            'n_ctx': 512,  # context size
            'n_batch': 512,  # batch size for prompt processing (must be >=32 to use BLAS)
            'n_keep': 0,  # number of tokens to keep from initial prompt
            'n_vocab': 50272,  # vocabulary size
            # sampling parameters
            'logit_bias': {},  # logit bias for specific tokens: <int, float>
            'top_k': 40,  # <= 0 to use vocab size
            'top_p': 0.95,  # 1.0 = disabled
            'tfs_z': 1.0,  # 1.0 = disabled
            'typical_p': 1.0,  # 1.0 = disabled
            'temp': 0.2,  # 1.0 = disabled
            'repeat_penalty': 1.1,  # 1.0 = disabled
            # last n tokens to penalize (0 = disable penalty, -1 = context size)
            'repeat_last_n': 64,
            'frequency_penalty': 0.0,  # 0.0 = disabled
            'presence_penalty': 0.0,  # 0.0 = disabled
            'mirostat': 0,  # 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
            'mirostat_tau': 5.0,  # target entropy
            'mirostat_eta': 0.1,  # learning rate
        }.items()
    )
)


We add the output streamer to manage the generation process.


In [None]:
def stream_output(output_stream):
    """Print a streamed generation word by word and return the final text.

    Each item of ``output_stream`` must be a mapping with a ``'text'`` entry
    holding the full text generated so far. All words except the last one are
    printed as soon as they appear, because the last word may still be
    extended by the next item; the remainder is printed once the stream ends.

    If the final item carries a non-``None`` ``'timing'`` entry with a
    non-empty ``'generation_time_list'``, the mean of that list is printed as
    a per-token generation time in milliseconds.

    Args:
        output_stream: Iterable of mappings with a ``'text'`` key and an
            optional ``'timing'`` key.

    Returns:
        The stripped final text with its space-separated words re-joined, or
        an empty string when the stream yields nothing.
    """
    print('ASSISTANT: ', end='', flush=True)
    printed = 0  # number of words already written to stdout
    words = []  # stays empty if the stream yields nothing
    last = None  # most recent item, inspected for timing afterwards

    for last in output_stream:
        words = last['text'].strip().split(' ')
        # Hold back the final word: it may still be incomplete.
        stable = len(words) - 1

        if stable > printed:
            print(' '.join(words[printed:stable]), end=' ', flush=True)
            printed = stable

    # Flush whatever has not been printed yet and end the line.
    print(' '.join(words[printed:]), flush=True)

    if last is not None and 'timing' in last and last['timing'] is not None:
        generation_time_list = last['timing']['generation_time_list']
        # An empty list would make np.average return nan with a warning.
        if len(generation_time_list) > 0:
            print('=' * 50)
            print('Speed of Inference')
            print('-' * 50)
            print(
                f'Generation Stage : {np.average(generation_time_list) * 1000:.2f} ms/token'
            )
            print('=' * 50)

    return ' '.join(words)


Finally, we can use the model for generation.


In [None]:
# Example question
query = 'Explain the benefits of weight-only quantization for large language models.'

# Prepare the prompter and stop tokens
# Both helpers are selected with the 'qwen' key and given model_path.
model_prompter = get_prompter('qwen', model_path, short_prompt=False)
stop_token_ids = get_stop_token_ids('qwen', model_path)

print(f'USER: {query}')

# Insert user query into the prompt template
# The resulting prompt is read back from model_prompter.model_input below.
model_prompter.insert_prompt(query)

# Use the generic TinyChat stream generator for Qwen2
stream_generator = StreamGenerator

# Start streaming generation with the quantized model, the tokenizer, the
# formatted prompt, and the conversation parameters in gen_params.
output_stream = stream_generator(
    model,
    tokenizer,
    model_prompter.model_input,
    0,  # start_pos
    gen_params,
    device=device,
    stop_token_ids=stop_token_ids,
    quant_llm=True,
)

# Print the reply as it streams; `outputs` receives the final text returned
# by stream_output.
outputs = stream_output(output_stream)
