In [None]:
import os
# Publish the Hugging Face access token through the process environment.
# Presumably this is what the model/tokenizer downloads in the later cells
# authenticate with -- confirm. Replace the placeholder locally and never
# commit a real token to version control.
os.environ.update({"HF_TOKEN": "<hf-api-key>"})

In [None]:
import torch
import time
from vllm import LLM, SamplingParams
from transformers import AutoTokenizer

# Benchmark: batched generation with vLLM.
#
# The tokenizer is loaded separately from the vLLM engine so that the chat
# template can be rendered to plain-text prompts by hand.
MODEL_NAME = "meta-llama/Llama-3.2-3B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token  # Just in case

# Base questions; the list is repeated by the multiplier at the end to reach
# the batch size under test. Toggle the commented entries and/or change the
# multiplier to benchmark other batch sizes.
user_questions = [
    "What is Thailand's national food symbol?",
    "What is the capital of Norway?",
    "Who invented the telescope?",
    "What is the chemical symbol for gold?",
    # "Tell me a fun fact about dolphins.",
    # "How many continents are there on Earth?",
    # "What is the speed of light in vacuum?",
    # "Who wrote the play 'Hamlet'?",
    #"What causes tides in the ocean?",
    #"Name the largest planet in our solar system.",
    #"What language is primarily spoken in Brazil?",
    #"What's the boiling point of water in Celsius?",
    #"Who painted the Mona Lisa?",
    #"What is the main ingredient in guacamole?",
    #"Why do cats purr?",
    #"What is the tallest mountain in the world?",
] * 2

# One (system, user) conversation per question.
messages_batch = [
    [
        {"role": "system", "content": "You are a helpful assistant. Respond in two sentences."},
        {"role": "user", "content": question},
    ]
    for question in user_questions
]

# Render each conversation to a prompt string.
# add_generation_prompt=True appends the assistant header so the model starts
# directly with the answer; without it the prompt ends after the user turn and
# every completion begins with a stray "assistant" line.
# NOTE(review): the rendered text may already start with the BOS token and
# vLLM may add another one when it tokenizes string prompts -- confirm, and
# strip the BOS or pass token ids instead if so.
rendered_prompts = [
    tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    for messages in messages_batch
]

# Generation parameters: cap each completion at 128 new tokens.
sampling_params = SamplingParams(max_tokens=128)

# Release cached GPU memory held by PyTorch before the engine allocates its own.
torch.cuda.empty_cache()

# Initialize vLLM
llm = LLM(
    model=MODEL_NAME,
    max_model_len=2048,
)

# Time only the generation call; perf_counter is monotonic and has higher
# resolution than time.time(), which makes it the better clock for intervals.
start_time = time.perf_counter()
outputs = llm.generate(rendered_prompts, sampling_params)
end_time = time.perf_counter()

# Print outputs, pairing each question with its completion.
print("--- Batch Outputs ---")
for i, (question, output) in enumerate(zip(user_questions, outputs), start=1):
    print(f"\nPrompt {i}: {question}")
    print(f"Response {i}: {output.outputs[0].text.strip()}")

# Report inference time
inference_time = end_time - start_time
print(f"\n⏱ Inference Time (batch of {len(user_questions)}): {inference_time:.2f} seconds")


  from .autonotebook import tqdm as notebook_tqdm


INFO 05-20 11:09:31 [__init__.py:239] Automatically detected platform cuda.


2025-05-20 11:09:33,636	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


INFO 05-20 11:09:43 [config.py:717] This model supports multiple tasks: {'embed', 'classify', 'generate', 'reward', 'score'}. Defaulting to 'generate'.
INFO 05-20 11:09:43 [config.py:2003] Chunked prefill is enabled with max_num_batched_tokens=8192.
INFO 05-20 11:09:45 [core.py:58] Initializing a V1 LLM engine (v0.8.5.post1) with config: model='meta-llama/Llama-3.2-3B-Instruct', speculative_config=None, tokenizer='meta-llama/Llama-3.2-3B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=2048, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='auto', reasoning_backend=None), observability_config=ObservabilityConfig(show_hidden_metrics=False, otlp_tr

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  50% Completed | 1/2 [00:01<00:01,  1.28s/it]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:01<00:00,  1.26it/s]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:01<00:00,  1.16it/s]



INFO 05-20 11:09:49 [loader.py:458] Loading weights took 1.76 seconds
INFO 05-20 11:09:49 [gpu_model_runner.py:1347] Model loading took 6.0160 GiB and 3.119919 seconds
INFO 05-20 11:09:59 [backends.py:420] Using cache directory: /home/jupyter/.cache/vllm/torch_compile_cache/bdc21dc8c4/rank_0_0 for vLLM's torch.compile
INFO 05-20 11:09:59 [backends.py:430] Dynamo bytecode transform time: 9.39 s
INFO 05-20 11:10:05 [backends.py:118] Directly load the compiled graph(s) for shape None from the cache, took 6.119 s
INFO 05-20 11:10:07 [monitor.py:33] torch.compile takes 9.39 s in total
INFO 05-20 11:10:08 [kv_cache_utils.py:634] GPU KV cache size: 114,816 tokens
INFO 05-20 11:10:08 [kv_cache_utils.py:637] Maximum concurrency for 2,048 tokens per request: 56.06x
INFO 05-20 11:10:33 [gpu_model_runner.py:1686] Graph capturing finished in 25 secs, took 0.44 GiB
INFO 05-20 11:10:33 [core.py:159] init engine (profile, create kv cache, warmup model) took 43.62 seconds
INFO 05-20 11:10:33 [core_clie

Processed prompts: 100%|██████████| 4/4 [00:02<00:00,  1.68it/s, est. speed input: 83.80 toks/s, output: 94.70 toks/s]

--- Batch Outputs ---

Prompt 1: What is Thailand's national food symbol?
Response 1: assistant

Based on its rich culinary culture, Thailand's national food symbol is the Congee, also known as tom yum na kanom jeen, a type of coconut milk-based soup, but another symbol is sticky rice or glutinous rice. However, a more commonly accepted symbol is the Mango, especially as in ' sticky rice and mango', as featured on Thailand's architectural and national emblems.

Prompt 2: What is the capital of Norway?
Response 2: assistant

The capital of Norway is Oslo. It is the country's largest city and is situated on the southern part of the country, along the Vimmerbumerelvi river.

Prompt 3: Who invented the telescope?
Response 3: assistant

The invention of the telescope is credited to Hans Lippershey, a Dutch spectacle maker, and Zacharias Janssen, another Dutch spectacle maker, who respectively received patents for their designs in 1608. However, it was Galileo Galilei, an Italian astronomer,




| Batch Size | Inference Time (seconds) |
|------------|--------------------------|
| 32         | 3.38                     |
| 16         | 3.01                     |
| 8          | 2.51                     |
| 4          | 2.32                     |

In [None]:
import torch
import time
from transformers import AutoModelForCausalLM, AutoTokenizer

# Benchmark: batched generation with Hugging Face transformers `generate`.
MODEL_NAME = "meta-llama/Llama-3.2-3B-Instruct"

# Load model and tokenizer.
# NOTE(review): no torch_dtype is passed here, while the vLLM run logs
# dtype=torch.bfloat16 -- confirm both runs use the same precision before
# comparing their timings.
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    padding_side="left",  # decoder-only models must be left-padded for generation
    truncation_side="left",
)

# Set pad_token (LLaMA doesn't define one by default)
tokenizer.pad_token = tokenizer.eos_token

# Base questions; the list is repeated by the multiplier at the end to reach
# the batch size under test. Toggle the commented entries and/or change the
# multiplier to benchmark other batch sizes.
user_questions = [
    "What is Thailand's national food symbol?",
    "What is the capital of Norway?",
    "Who invented the telescope?",
    "What is the chemical symbol for gold?",
    # "Tell me a fun fact about dolphins.",
    # "How many continents are there on Earth?",
    # "What is the speed of light in vacuum?",
    # "Who wrote the play 'Hamlet'?",
    #"What causes tides in the ocean?",
    #"Name the largest planet in our solar system.",
    #"What language is primarily spoken in Brazil?",
    #"What's the boiling point of water in Celsius?",
    #"Who painted the Mona Lisa?",
    #"What is the main ingredient in guacamole?",
    #"Why do cats purr?",
    #"What is the tallest mountain in the world?",
] * 2

# One (system, user) conversation per question.
messages_batch = [
    [
        {"role": "system", "content": "You are a helpful assistant. Respond in two sentences."},
        {"role": "user", "content": question},
    ]
    for question in user_questions
]

# Tokenize with the chat template, padding and truncation.
# - add_generation_prompt=True appends the assistant header so the model
#   answers directly instead of first emitting an "assistant" line.
# - return_dict=True also returns the attention mask. Because pad_token is the
#   same as eos_token, generate() cannot infer the mask from input_ids alone.
model_inputs = tokenizer.apply_chat_template(
    messages_batch,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt",
    return_dict=True,
    padding=True,
    truncation=True,
    max_length=2048,
).to(model.device)
input_ids = model_inputs["input_ids"]

# Clear previous GPU stats
torch.cuda.empty_cache()
torch.cuda.reset_peak_memory_stats()

# With left padding every prompt occupies the first `input_length` positions,
# so anything after that index is newly generated text.
input_length = input_ids.shape[1]

# Synchronize around the timed region so pending GPU work from earlier steps
# is not counted and all generation kernels have finished when the clock stops.
torch.cuda.synchronize()
start_time = time.perf_counter()

# Generate responses
generated_ids = model.generate(
    **model_inputs,
    max_new_tokens=128,
    pad_token_id=tokenizer.pad_token_id,
)

torch.cuda.synchronize()
end_time = time.perf_counter()

# Decode only the newly generated tokens.
decoded_outputs = tokenizer.batch_decode(
    generated_ids[:, input_length:],
    skip_special_tokens=True,
)

# Print outputs, pairing each question with its completion.
print("--- Batch Outputs ---")
for i, (question, output) in enumerate(zip(user_questions, decoded_outputs), start=1):
    print(f"\nPrompt {i}: {question}")
    print(f"Response {i}: {output.strip()}")

# Report performance
inference_time = end_time - start_time
peak_memory = torch.cuda.max_memory_allocated() / (1024 ** 2)  # in MB

# Report the real batch size rather than a hard-coded number.
print(f"\n⏱ Inference Time (batch of {len(user_questions)}): {inference_time:.2f} seconds")
print(f"📊 Peak GPU Memory: {peak_memory:.2f} MB")


  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.07it/s]
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


--- Batch Outputs ---

Prompt 1: What is Thailand's national food symbol?
Response 1: assistant

The national food symbol of Thailand is the "khao niew" or sticky rice, which is typically served with most Thai meals. It is a staple food in Thai cuisine and is often served alongside dishes such as curries, soups, and grilled meats.

Prompt 2: What is the capital of Norway?
Response 2: assistant

The capital of Norway is Oslo. It is located in the southeastern part of the country and is known for its cultural and historical landmarks, such as the Opera House and the Viking Ship Museum.

Prompt 3: Who invented the telescope?
Response 3: assistant

The invention of the telescope is credited to Hans Lippershey, a Dutch spectacle maker, who patented the first practical refracting telescope in 1608. However, Galileo Galilei is often credited with the first practical refracting telescope in 1609, as he improved upon Lippershey's design and made significant contributions to the field of astrono

| Batch Size | Inference Time (seconds) |
|------------|--------------------------|
| 32         | 12.90                    |
| 16         | 9.50                     |
| 8          | 7.24                     |
| 4          | 5.32                     |