<a href="https://colab.research.google.com/github/anushadudi/inference_latency_optimization/blob/main/quant_4bnb_bench.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install accelerate bitsandbytes>=0.45.0 vllm huggingface-hub torch


In [None]:
import shutil
import time
from pathlib import Path

import pandas as pd
import torch
from google.colab import drive
from google.colab import userdata
from huggingface_hub import login
from vllm import LLM, SamplingParams
# Benchmark prompts, grouped by topic. run_benchmark() sends each prompt to the
# model as its own request, in this order. The result for the first prompt is
# not included in the saved table, so that entry effectively serves as a
# warm-up request.
questions = [
    # Coding questions
    "Implement a Python function to compute the Fibonacci numbers.",
    "Write a Rust function that performs binary exponentiation.",
    "What are the differences between Javascript and Python?",
    # Literature
    "Write a story in the style of James Joyce about a trip to the Australian outback in 2083, to see robots in the beautiful desert.",
    "Who does Harry turn into a balloon?",
    "Write a tale about a time-traveling historian who's determined to witness the most significant events in human history.",
    # Math
    "What is the product of 9 and 8?",
    "If a train travels 120 kilometers in 2 hours, what is its average speed?",
    "Think through this step by step. If the sequence a_n is defined by a_1 = 3, a_2 = 5, and a_n = a_(n-1) + a_(n-2) for n > 2, find a_6.",
]


def initiateModel():
    """Log in to the Hugging Face Hub and build the 4-bit vLLM engine.

    The Hub token is read from the Colab secret named ``HF_TOKEN``. The engine
    is created for the pre-quantized bitsandbytes checkpoint with bfloat16 as
    the compute dtype.

    Returns:
        The constructed ``LLM`` instance.
    """
    hf_token = userdata.get('HF_TOKEN')
    login(token=hf_token)

    engine_options = {
        "model": "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
        "dtype": torch.bfloat16,
        "trust_remote_code": True,
        # Both options are set to bitsandbytes so the checkpoint is loaded
        # through vLLM's bitsandbytes path.
        "quantization": "bitsandbytes",
        "load_format": "bitsandbytes",
    }
    return LLM(**engine_options)


INFO 02-19 06:39:30 __init__.py:190] Automatically detected platform cuda.


In [None]:
def generate(question, llm, sampling_params=None):
    """Run one prompt through the engine and collect latency/throughput stats.

    Args:
        question: Prompt text; passed to ``llm.generate`` unchanged.
        llm: Engine object exposing ``generate(prompt, sampling_params)``.
        sampling_params: Optional sampling configuration. When omitted, the
            benchmark default is used (temperature=0.01, top_p=0.01,
            max_tokens=200).

    Returns:
        A dict that always contains ``question``. When the engine returns at
        least one output it also contains, in this order: ``tok_count``,
        ``time`` (wall-clock seconds for the whole ``llm.generate`` call),
        ``answer``, ``tokens_per_second``, ``ms_per_seq_output_token``,
        ``time_to_first_token`` and ``metrics``. A derived field is ``None``
        when it cannot be computed (zero generated tokens, zero elapsed time,
        or missing engine timestamps) instead of raising.
    """
    if sampling_params is None:
        sampling_params = SamplingParams(temperature=0.01, top_p=0.01, max_tokens=200)

    # NOTE(review): the prompt is sent as raw text. Several answers in the saved
    # output read like plain text continuations, so the model's chat template
    # may be worth applying -- TODO confirm before comparing answer quality.
    start = time.perf_counter()
    result = llm.generate(question, sampling_params)
    request_time = time.perf_counter() - start

    response = {'question': question}
    for output in result:
        completion = output.outputs[0]
        tok_count = len(completion.token_ids)

        # Engine-side timestamps may be unavailable; getattr on None yields None.
        metrics = output.metrics
        first_token_time = getattr(metrics, 'first_token_time', None)
        arrival_time = getattr(metrics, 'arrival_time', None)
        if first_token_time is None or arrival_time is None:
            time_to_first_token = None
        else:
            time_to_first_token = first_token_time - arrival_time

        response['tok_count'] = tok_count
        response['time'] = request_time
        response['answer'] = completion.text
        # Guard both divisions so an empty completion cannot crash the benchmark.
        response['tokens_per_second'] = tok_count / request_time if request_time > 0 else None
        response['ms_per_seq_output_token'] = request_time * 1000 / tok_count if tok_count else None
        response['time_to_first_token'] = time_to_first_token
        response['metrics'] = metrics
    return response

In [None]:
def run_benchmark(llm, warmup=1, output_path='bench-vllm-quant.csv'):
    """Run every prompt in ``questions`` and save the per-request measurements.

    Each prompt is sent through ``generate`` one at a time. The first
    ``warmup`` requests are still executed but their measurements are left out
    of the results.

    Args:
        llm: Engine instance forwarded to ``generate``.
        warmup: Number of leading prompts to run without recording. Defaults
            to 1, i.e. the first prompt is discarded.
        output_path: Where the results CSV is written.

    Returns:
        pandas.DataFrame with one row per recorded request.

    Raises:
        ValueError: If ``warmup`` is negative.
    """
    if warmup < 0:
        raise ValueError("warmup must be >= 0")

    responses = []
    for index, q in enumerate(questions):
        response = generate(question=q, llm=llm)
        # Warm-up requests are issued like any other but not recorded.
        if index >= warmup:
            responses.append(response)

    df = pd.DataFrame(responses)
    df.to_csv(output_path, index=False)
    return df

In [None]:
# Build the bitsandbytes-quantized vLLM engine once; the same instance is
# passed to run_benchmark() for every request.
llm = initiateModel()


config.json:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

INFO 02-19 06:39:46 config.py:542] This model supports multiple tasks: {'reward', 'embed', 'score', 'generate', 'classify'}. Defaulting to 'generate'.
INFO 02-19 06:39:47 config.py:1556] Chunked prefill is enabled with max_num_batched_tokens=2048.
INFO 02-19 06:39:47 llm_engine.py:234] Initializing a V0 LLM engine (v0.7.2) with config: model='unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit', speculative_config=None, tokenizer='unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=131072, download_dir=None, load_format=LoadFormat.BITSANDBYTES, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=bitsandbytes, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar'), observability_config=ObservabilityConfig(otlp_t

tokenizer_config.json:   0%|          | 0.00/55.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

INFO 02-19 06:39:49 cuda.py:230] Using Flash Attention backend.
INFO 02-19 06:39:50 model_runner.py:1110] Starting to load model unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit...
INFO 02-19 06:39:50 loader.py:1102] Loading weights with BitsAndBytes quantization.  May take a while ...
INFO 02-19 06:39:50 weight_utils.py:252] Using model weights format ['*.safetensors']


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 02-19 06:42:10 model_runner.py:1115] Loading model weights took 5.3424 GB
INFO 02-19 06:42:11 worker.py:267] Memory profiling takes 1.01 seconds
INFO 02-19 06:42:11 worker.py:267] the current vLLM instance can use total_gpu_memory (39.56GiB) x gpu_memory_utilization (0.90) = 35.60GiB
INFO 02-19 06:42:11 worker.py:267] model weights take 5.34GiB; non_torch_memory takes 0.09GiB; PyTorch activation peak memory takes 1.19GiB; the rest of the memory reserved for KV Cache is 28.98GiB.
INFO 02-19 06:42:12 executor_base.py:110] # CUDA blocks: 14837, # CPU blocks: 2048
INFO 02-19 06:42:12 executor_base.py:115] Maximum concurrency for 131072 tokens per request: 1.81x
INFO 02-19 06:42:14 model_runner.py:1434] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_u

Capturing CUDA graph shapes: 100%|██████████| 35/35 [00:30<00:00,  1.14it/s]

INFO 02-19 06:42:44 model_runner.py:1562] Graph capturing finished in 31 secs, took 0.64 GiB
INFO 02-19 06:42:44 llm_engine.py:431] init engine (profile, create kv cache, warmup model) took 34.59 seconds





In [None]:
# Run every prompt through the engine (this also writes the results CSV),
# then display the per-request results table.
result = run_benchmark(llm)
result

Processed prompts: 100%|██████████| 1/1 [00:02<00:00,  2.75s/it, est. speed input: 4.00 toks/s, output: 72.65 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:02<00:00,  2.65s/it, est. speed input: 4.15 toks/s, output: 75.49 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:02<00:00,  2.65s/it, est. speed input: 3.77 toks/s, output: 75.44 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:02<00:00,  2.66s/it, est. speed input: 11.68 toks/s, output: 75.32 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:02<00:00,  2.65s/it, est. speed input: 3.39 toks/s, output: 75.42 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:02<00:00,  2.65s/it, est. speed input: 9.04 toks/s, output: 75.37 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:02<00:00,  2.66s/it, est. speed input: 4.51 toks/s, output: 75.15 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:02<00:00,  2.66s/it, est. speed input: 7.16 toks/s, output: 75.32 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:02<00:00,  

Unnamed: 0,question,tok_count,time,answer,tokens_per_second,ms_per_seq_output_token,time_to_first_token,metrics
0,Write a Rust function that performs binary exp...,200,2.65278,Binary exponentiation is a method for quickly...,75.392604,13.2639,0.071113,"RequestMetrics(arrival_time=1739947367.778117,..."
1,What are the differences between Javascript an...,200,2.654409,"Javascript is a high-level, dynamic, and inte...",75.34634,13.272045,0.071313,"RequestMetrics(arrival_time=1739947370.430931,..."
2,Write a story in the style of James Joyce abou...,200,2.658717,"\nThe desert stretched out before us, a vast ...",75.224253,13.293585,0.074211,RequestMetrics(arrival_time=1739947373.0853832...
3,Who does Harry turn into a balloon?,200,2.655115,?\nHarry Potter and the Philosopher's Stone (p...,75.326303,13.275575,0.074538,RequestMetrics(arrival_time=1739947375.7441285...
4,Write a tale about a time-traveling historian ...,200,2.657508,"She's got a time machine, but it's temperamen...",75.258471,13.287541,0.073861,RequestMetrics(arrival_time=1739947378.3992631...
5,What is the product of 9 and 8?,200,2.664917,"[ #permalink ] New post Feb 9, 2005\nWhat is ...",75.049252,13.324583,0.073195,RequestMetrics(arrival_time=1739947381.0568027...
6,"If a train travels 120 kilometers in 2 hours, ...",200,2.659092,Average speed is calculated by dividing the d...,75.213643,13.29546,0.074841,RequestMetrics(arrival_time=1739947383.7217417...
7,Think through this step by step. If the sequen...,200,2.655037,(This is a classic example of a recursive seq...,75.328512,13.275186,0.071566,RequestMetrics(arrival_time=1739947386.3808396...


In [None]:
# Copy the benchmark CSV to Google Drive.
# The previous shell `cp` failed with "No such file or directory": the target
# folder was not available. Mount Drive explicitly, create the destination
# folder if needed, and copy with the standard library so the space in
# "Colab Notebooks" needs no shell escaping.
drive.mount('/content/drive')
drive_results_dir = Path('/content/drive/MyDrive/Colab Notebooks/files')
drive_results_dir.mkdir(parents=True, exist_ok=True)
shutil.copy('bench-vllm-quant.csv', drive_results_dir / 'bench-vllm-quant2.csv')


cp: cannot create regular file 'drive/MyDrive/Colab\ Notebooks/files/bench-vllm-quant2.csv': No such file or directory


Reference: [vLLM BitsAndBytes quantization guide](https://docs.vllm.ai/en/latest/features/quantization/bnb.html)