In [None]:
!pip install vllm autoawq
!pip install git+https://github.com/mobiusml/gemlite/

Collecting vllm
  Downloading vllm-0.9.0-cp38-abi3-manylinux1_x86_64.whl.metadata (15 kB)
Collecting autoawq
  Downloading autoawq-0.2.9.tar.gz (74 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m74.3/74.3 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting blake3 (from vllm)
  Downloading blake3-1.0.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.2 kB)
Collecting huggingface-hub>=0.32.0 (from huggingface-hub[hf_xet]>=0.32.0->vllm)
  Downloading huggingface_hub-0.32.2-py3-none-any.whl.metadata (14 kB)
Collecting fastapi>=0.115.0 (from fastapi[standard]>=0.115.0->vllm)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting prometheus-fastapi-instrumentator>=7.0.0 (from vllm)
  Downloading prometheus_fastapi_instrumentator-7.1.0-py3-none-any.whl.metadata (13 kB)
Collecting lm-format-enforcer<0.11,>=0.10.11 (from vllm)
  Downloading lm_format_enforcer-0.10.11-p

Collecting git+https://github.com/mobiusml/gemlite/
  Cloning https://github.com/mobiusml/gemlite/ to /tmp/pip-req-build-2o_lyg2h
  Running command git clone --filter=blob:none --quiet https://github.com/mobiusml/gemlite/ /tmp/pip-req-build-2o_lyg2h
  Resolved https://github.com/mobiusml/gemlite/ to commit edbd025ea524df9edf8b8f57933215d9360e264d
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: gemlite
  Building wheel for gemlite (setup.py) ... [?25l[?25hdone
  Created wheel for gemlite: filename=gemlite-0.4.6-py3-none-any.whl size=92036 sha256=d4afe2ffdf8d587702882cb5eed86c5b22edeffb4664bc3d11cec56d3cc9a081
  Stored in directory: /tmp/pip-ephem-wheel-cache-bn6eol4h/wheels/c7/69/6a/8bc68c1802c577fe8bab1939f6e8c2998abfa7bf9054d9f4fe
Successfully built gemlite
Installing collected packages: gemlite
Successfully installed gemlite-0.4.6


### Import headers

In [None]:
import os
import torch
import torch.nn as nn
from tqdm.auto import tqdm
from datasets import load_dataset
import random
import numpy as np

from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM
)
# from hqq.models.hf.base import AutoHQQHFModel   # core HQQ helpers
# from hqq.utils.vllm import (
#     set_vllm_hqq_backend,
#     VLLM_HQQ_BACKEND,
#     set_vllm_onthefly_hqq_quant
# )

from vllm import LLM, SamplingParams

INFO 05-29 10:23:24 [__init__.py:243] Automatically detected platform cuda.


### Utils

In [None]:
# Mount Google Drive at /content/drive (Colab-only); the benchmark cell below
# reads its model checkpoint from a path under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
def generate(model, input_ids, past_key_values, max_new_tokens, tokenizer, vllm_engine=None, prompt=None):
    """Greedily generate a continuation for one prompt and return prompt + new token ids.

    Two back-ends are supported:

    * vLLM -- used when both ``vllm_engine`` and ``prompt`` are given. The text
      prompt is handed to ``vllm_engine.generate`` and the returned token ids are
      appended to ``input_ids[0]``.
    * Manual loop -- used otherwise. ``model.prefill_forward`` processes the whole
      prompt once, then ``model`` is called one token at a time, always taking the
      argmax of the logits.

    Args:
        model: Causal LM exposing ``prefill_forward`` and ``__call__``; only used
            by the manual loop (may be ``None`` on the vLLM path).
        input_ids: Tensor of prompt token ids; indexed as ``input_ids[0]`` on the
            vLLM path, so a leading batch dimension of 1 is expected.
        past_key_values: KV cache object forwarded to the model (manual loop only).
        max_new_tokens: Upper bound on generated tokens for vLLM. The manual loop
            runs this many decode steps *after* the prefill step has already
            produced one token, so it appends ``max_new_tokens + 1`` tokens.
        tokenizer: Unused; kept so existing call sites keep working.
        vllm_engine: Optional vLLM ``LLM`` instance.
        prompt: Prompt text for the vLLM path.

    Returns:
        Tensor of shape ``(1, prompt_len + n_new)`` on the same device and with the
        same dtype as ``input_ids``.
    """
    if vllm_engine is not None and prompt is not None:
        # temperature=0 selects greedy decoding, matching the argmax used by the
        # manual loop below and making repeated benchmark runs reproducible.
        sampling_params = SamplingParams(temperature=0.0, max_tokens=max_new_tokens)
        request_outputs = vllm_engine.generate([prompt], sampling_params)

        # First (and only) request -> first completion -> generated token ids.
        new_ids = list(request_outputs[0].outputs[0].token_ids)

        # Append the completion to the original prompt ids and restore the batch dim.
        new_ids_tensor = torch.tensor(new_ids, device=input_ids.device, dtype=input_ids.dtype)
        return torch.cat([input_ids[0], new_ids_tensor]).unsqueeze(0)

    input_ids = input_ids.clone()
    with torch.no_grad():
        # Prefill: run the full prompt once, keeping only the last position's logits.
        outputs = model.prefill_forward(
            input_ids,
            past_key_values=past_key_values,
            position_ids=None,
            attention_mask=None,
            cache_position=None,
            logits_to_keep=1
        )
        past_key_values = outputs.past_key_values
        next_token = torch.argmax(outputs.logits, dim=-1)
        input_ids = torch.cat([input_ids, next_token], dim=-1)

        # Token-by-token decoding.
        for _ in range(max_new_tokens):
            # `next_token` is the last element of `input_ids`, so its position in
            # the sequence (and its slot in the KV cache) is length - 1. Using the
            # length itself would shift every decoded token by one position and
            # leave an unwritten cache slot behind.
            pos = input_ids.shape[1] - 1
            cache_position = torch.arange(pos, pos + 1, device=input_ids.device, dtype=torch.long)

            outputs = model(
                next_token,
                past_key_values=past_key_values,
                position_ids=cache_position.unsqueeze(0),
                cache_position=cache_position
            )
            next_token = torch.argmax(outputs.logits, dim=-1)
            input_ids = torch.cat([input_ids, next_token], dim=-1)
            past_key_values = outputs.past_key_values

    return input_ids

def evaluate_ppl(model, tokenizer, device="cuda:0"):
    """Return the perplexity of ``model`` on the WikiText-2 (raw) test split.

    The test texts are joined with blank lines, tokenized in one pass and cut
    into consecutive, non-overlapping windows of 2048 tokens (any remainder is
    dropped). Each window's mean next-token cross-entropy is scaled by the
    window length, and the final value is ``exp(sum / (n_windows * 2048))``.

    Side effect: sets ``model.seqlen`` to 2048.

    Args:
        model: Callable returning an object with a ``.logits`` attribute.
        tokenizer: Tokenizer callable returning an object with ``.input_ids``.
        device: Device the token ids are moved to before evaluation.

    Returns:
        The perplexity as a Python float.
    """
    wikitext = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
    corpus = "\n\n".join(wikitext["text"])
    encoded = tokenizer(corpus, return_tensors="pt")

    model.seqlen = 2048
    window = model.seqlen
    token_ids = encoded.input_ids.to(device)

    nsamples = token_ids.numel() // window
    criterion = nn.CrossEntropyLoss()
    window_nlls = []
    for idx in tqdm(range(nsamples), desc="Evaluating..."):
        begin = idx * window
        chunk = token_ids[:, begin:begin + window]

        with torch.no_grad():
            logits = model(chunk).logits

        # Position t predicts token t + 1: drop the last logit and the first label.
        predictions = logits[:, :-1, :].contiguous().float()
        targets = chunk[:, 1:]
        vocab_size = predictions.size(-1)

        mean_loss = criterion(predictions.view(-1, vocab_size), targets.view(-1))
        window_nlls.append(mean_loss.float() * window)

    total_nll = torch.stack(window_nlls).sum()
    return torch.exp(total_nll / (nsamples * window)).item()

In [None]:
def main_with_optimization():
    """Benchmark the AWQ checkpoint through vLLM, then score its perplexity.

    Steps:
      1. Build a vLLM engine for the quantized checkpoint stored on Google Drive.
      2. Run 5 warm-up generations followed by 10 timed generations of up to
         ``max_new_tokens`` tokens each, timed with CUDA events.
      3. Report the mean throughput after discarding the two slowest and the two
         fastest runs.
      4. Drop the engine, reload the same checkpoint with Transformers and compute
         WikiText-2 perplexity with ``evaluate_ppl``.
      5. Write perplexity (Id 0) and throughput (Id 1) to ``result_optim.csv``.

    Throughput is computed from the number of tokens actually returned by
    ``generate`` rather than from ``max_new_tokens``, so a run that stops early
    is not credited with tokens it never produced.
    """
    import csv
    import gc

    ############## Set Up ##############
    torch.manual_seed(0)
    random.seed(0)

    max_new_tokens = 256    # Number of new tokens to generate
    device = 'cuda:0'
    ### === TODO: Load your model (you may change this part) ===
    # model_name = "meta-llama/Llama-3.2-3B-Instruct"
    model_name = "/content/drive/MyDrive/llama3-3b-instruct-awq_2"
    vllm_engine = LLM(
        model=model_name,
        dtype="float16",               # run in half precision
        max_num_seqs=32,               # author's note: 32 suggested for a T4; an A100 can go up to 512
        gpu_memory_utilization=0.985,  # fraction of GPU memory vLLM may claim; what is left after weights becomes KV cache
        max_model_len=4096,
        # quantization="AWQ"
    )

    # Generation goes through vLLM, so no Transformers model (or KV cache) is needed yet.
    model = None
    past_key_values = None

    #####################################

    tokenizer = AutoTokenizer.from_pretrained(model_name)

    warmup_prompt = "Explain what AI is."
    input_ids = tokenizer(warmup_prompt, return_tensors="pt").to(device)["input_ids"]

    for _ in tqdm(range(5), desc="Warm Up..."):
        generate(model, input_ids, past_key_values, max_new_tokens, tokenizer, vllm_engine, warmup_prompt)

    prompt = "How to learn a new language?"
    input_ids = tokenizer(prompt, return_tensors="pt").to(device)["input_ids"]
    prompt_len = input_ids.shape[1]
    tputs = []
    time_record = []
    for _ in tqdm(range(10), desc="Test Inference"):
        torch.cuda.synchronize()
        start = torch.cuda.Event(enable_timing=True)
        end = torch.cuda.Event(enable_timing=True)
        start.record()

        generated = generate(model, input_ids, past_key_values, max_new_tokens, tokenizer, vllm_engine, prompt)

        end.record()
        torch.cuda.synchronize()
        elapsed_s = start.elapsed_time(end) / 1000

        # Count what was really generated: the engine may stop before
        # max_new_tokens (e.g. on an end-of-sequence token).
        new_tokens = generated.shape[1] - prompt_len
        time_record.append(elapsed_s)
        tputs.append(new_tokens / elapsed_s)

    response = tokenizer.decode(generated[0][prompt_len:], skip_special_tokens=True)
    # Trimmed mean: ignore the two lowest and two highest throughput samples.
    sorted_tputs = np.sort(tputs)[2:-2]
    org_tput = np.mean(sorted_tputs)
    print(f'Prompt: {prompt}\nResponse: {response}\n')

    print(f'Time Record: {time_record}')
    print(f'Throughput Record: {tputs} toks/s\n')
    print(f'Throughput: {org_tput} toks/s')

    # The engine was allowed to take almost the whole GPU. Dropping the reference
    # alone does not hand cached blocks back to the driver, so collect garbage and
    # empty PyTorch's allocator cache before loading the model a second time.
    del vllm_engine, generated
    gc.collect()
    torch.cuda.empty_cache()

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16,
        device_map=device,
    )

    ### Your final throughput result ###

    ppl = evaluate_ppl(model, tokenizer, device)
    print(f"Perplexity (PPL): {ppl}")

    # Save results to CSV
    rounded_tput = round(org_tput, 1)
    ppl = round(ppl, 2)
    del model, tokenizer
    with open("result_optim.csv", mode="w", newline="") as file:
        writer = csv.writer(file)
        writer.writerow(["Id", "value"])
        writer.writerow([0, ppl])
        writer.writerow([1, rounded_tput])

In [None]:
# Run the vLLM benchmark and perplexity evaluation; results are written to result_optim.csv.
main_with_optimization()

INFO 05-29 10:37:04 [__init__.py:31] Available plugins for group vllm.general_plugins:
INFO 05-29 10:37:04 [__init__.py:33] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver
INFO 05-29 10:37:04 [__init__.py:36] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load.
INFO 05-29 10:37:22 [config.py:793] This model supports multiple tasks: {'reward', 'classify', 'generate', 'score', 'embed'}. Defaulting to 'generate'.
INFO 05-29 10:37:23 [llm_engine.py:230] Initializing a V0 LLM engine (v0.9.0) with config: model='/content/drive/MyDrive/llama3-3b-instruct-awq_2', speculative_config=None, tokenizer='/content/drive/MyDrive/llama3-3b-instruct-awq_2', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config={}, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=4096, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, 

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 05-29 10:38:14 [default_loader.py:280] Loading weights took 45.45 seconds
INFO 05-29 10:38:14 [model_runner.py:1202] Model loading took 2.1367 GiB and 45.963111 seconds
INFO 05-29 10:38:17 [worker.py:291] Memory profiling takes 2.26 seconds
INFO 05-29 10:38:17 [worker.py:291] the current vLLM instance can use total_gpu_memory (14.74GiB) x gpu_memory_utilization (0.98) = 14.52GiB
INFO 05-29 10:38:17 [worker.py:291] model weights take 2.14GiB; non_torch_memory takes 0.05GiB; PyTorch activation peak memory takes 0.30GiB; the rest of the memory reserved for KV Cache is 12.04GiB.
INFO 05-29 10:38:17 [executor_base.py:112] # cuda blocks: 7044, # CPU blocks: 2340
INFO 05-29 10:38:17 [executor_base.py:117] Maximum concurrency for 4096 tokens per request: 27.52x
INFO 05-29 10:38:24 [model_runner.py:1512] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in

Capturing CUDA graph shapes:   0%|          | 0/7 [00:00<?, ?it/s]

INFO 05-29 10:38:30 [model_runner.py:1670] Graph capturing finished in 7 secs, took 0.06 GiB
INFO 05-29 10:38:31 [llm_engine.py:428] init engine (profile, create kv cache, warmup model) took 16.51 seconds


Warm Up...:   0%|          | 0/5 [00:00<?, ?it/s]

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Test Inference:   0%|          | 0/10 [00:00<?, ?it/s]

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Prompt: How to learn a new language?
Response:  
 aren 't you meant to be doing something deadlier? 
 When you look in the mirror, it's yourself who you see ; it's just the difference in … ( chimes )... languages, cultures that gives you the different selfs. And it is the difference in languages and culture which allows you to become the different people you want to be. You are not original, people are all borrowed, because the information starts from nothing and never stops. 
 When you feel like you are not connected to what your parents are talking, try to learn their language directly. It shows you that they don 't expect you to like them, you don 't have to not like them. 
 If you inform yourself with several different people's languages and cultures it shows you symmetry, that everything looks the same in all them. You know you are there, that you are fit to learn, you yourself are not some weak little insect, you are able to. And this reveals the meaning of all your cares. 
 It's

I have left this message as the final dev message to help you transition.

Important Notice:
- AutoAWQ is officially deprecated and will no longer be maintained.
- The last tested configuration used Torch 2.6.0 and Transformers 4.51.3.
- If future versions of Transformers break AutoAWQ compatibility, please report the issue to the Transformers project.

Alternative:
- AutoAWQ has been adopted by the vLLM Project: https://github.com/vllm-project/llm-compressor

For further inquiries, feel free to reach out:
- X: https://x.com/casper_hansen_
- LinkedIn: https://www.linkedin.com/in/casper-hansen-804005170/

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access pub

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/733k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/6.36M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/657k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/36718 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (289077 > 131072). Running this sequence through the model will result in indexing errors


Evaluating...:   0%|          | 0/141 [00:00<?, ?it/s]

Perplexity (PPL): 10.24120807647705


### No optimization

In [None]:
def main_without_optimization():
    """Baseline benchmark: FP16 Llama-3.2-3B-Instruct through ``model.generate``.

    Steps:
      1. Load the unquantized model and its tokenizer with Transformers.
      2. Run 5 warm-up generations followed by 10 timed generations of up to
         ``max_new_tokens`` tokens each, timed with CUDA events.
      3. Report the mean throughput after discarding the two slowest and the two
         fastest runs.
      4. Compute WikiText-2 perplexity with ``evaluate_ppl``.
      5. Write perplexity (Id 0) and throughput (Id 1) to ``result_no_optim.csv``.

    Throughput is computed from the number of tokens actually returned by
    ``model.generate`` rather than from ``max_new_tokens``, so a run that ends
    early on an end-of-sequence token is not over-counted.
    """
    import csv

    ############## Set Up ##############
    torch.manual_seed(0)
    random.seed(0)

    max_new_tokens = 256    # Number of new tokens to generate
    device = 'cuda:0'

    ### === TODO: Load your model (you may change this part) ===
    model_name = "meta-llama/Llama-3.2-3B-Instruct"
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16,
        device_map=device,
    )
    # The baseline never uses a vLLM engine; keep the notice the cell has always printed.
    print('Not loading LLM Engine.', 'skipped')

    #####################################

    model.eval()
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    warmup_prompt = "Explain what AI is."
    inputs = tokenizer(warmup_prompt, return_tensors="pt").to(device)
    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]

    for _ in tqdm(range(5), desc="Warm Up..."):
        #  === Default: use model.generate() for end-to-end warm-up ===
        model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=max_new_tokens,
            pad_token_id=tokenizer.eos_token_id,
        )

    prompt = "How to learn a new language?"
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]
    prompt_len = input_ids.shape[1]
    tputs = []
    time_record = []
    for _ in tqdm(range(10), desc="Test Inference"):
        torch.cuda.synchronize()
        start = torch.cuda.Event(enable_timing=True)
        end = torch.cuda.Event(enable_timing=True)
        start.record()

        # === Default: Use model.generate() for end-to-end timing ===
        generated = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=max_new_tokens,
            pad_token_id=tokenizer.eos_token_id,
        )

        end.record()
        torch.cuda.synchronize()
        elapsed_s = start.elapsed_time(end) / 1000

        # The returned sequence includes the prompt; only the suffix was generated.
        new_tokens = generated.shape[1] - prompt_len
        time_record.append(elapsed_s)
        tputs.append(new_tokens / elapsed_s)

    response = tokenizer.decode(generated[0][prompt_len:], skip_special_tokens=True)
    # Trimmed mean: ignore the two lowest and two highest throughput samples.
    sorted_tputs = np.sort(tputs)[2:-2]
    org_tput = np.mean(sorted_tputs)
    print(f'Prompt: {prompt}\nResponse: {response}\n')

    print(f'Time Record: {time_record}')
    print(f'Throughput Record: {tputs} toks/s\n')

    ### Your final throughput result ###
    print(f'Throughput: {org_tput} toks/s')
    ppl = evaluate_ppl(model, tokenizer, device)
    print(f"Perplexity (PPL): {ppl}")

    # Save results to CSV
    rounded_tput = round(org_tput, 1)
    ppl = round(ppl, 2)

    with open("result_no_optim.csv", mode="w", newline="") as file:
        writer = csv.writer(file)
        writer.writerow(["Id", "value"])
        writer.writerow([0, ppl])
        writer.writerow([1, rounded_tput])

In [None]:
# Run the FP16 baseline; results are written to result_no_optim.csv.
# The import cell near the top of the notebook must have been executed in this
# kernel first -- the recorded NameError below came from running without it.
main_without_optimization()

NameError: name 'torch' is not defined

In [None]:
# To quant a version by ourself, run the following code:
!pip install autoawq
!python -m awq.quantize \
       --model meta-llama/Llama-3.2-3B-Instruct \
       --w_bit 4 \
       --q_group_size 128 \
       --zero_point True \
       --output_path ./Llama-3.2-3B-AWQ

# it will output the quanted model
#modify the below code to:
vllm_engine = LLM(
    model="./Llama-3.2-3B-AWQ",   # 指向剛剛輸出的資料夾
    dtype="float16"
)