In [None]:
import os
import pandas as pd
from tqdm import tqdm
from vllm import LLM, SamplingParams
from transformers import AutoTokenizer
# from llmcompressor.transformers import SparseAutoModelForCausalLM

In [None]:
# Hugging Face repo ids of the checkpoints compared in this benchmark.
model_ids = [
    "meta-llama/Llama-3.2-1B-Instruct",
    "neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8",
    "ciCic/llama-3.2-1B-Instruct-AWQ",
]

# Sentences that are concatenated into progressively longer prompts.
messages = [
    "The rain tapped gently against the window pane.",
    "He found comfort in the rhythm of the ocean waves.",
    "The children laughed as they played in the park.",
    "A soft breeze carried the scent of blooming flowers.",
    "She painted with colors that seemed to dance on the canvas.",
    "The cat curled up in the warm sunlight.",
    "He took a deep breath and dove into the clear blue water.",
    "The music filled the room with a peaceful ambiance.",
    "They watched the stars, imagining distant worlds.",
    "Her smile lit up the room like the morning sun.",
    "The forest was alive with the sounds of birds and insects.",
    "He wrote letters that he never intended to send.",
    "The coffee shop was a haven of warmth and quiet.",
    "She spent the afternoon tending to her garden.",
    "The snowflakes fell softly, covering everything in white.",
    "He dreamed of traveling to distant lands.",
    "The smell of freshly baked bread filled the air.",
    "She wore a necklace that shimmered in the light.",
    "The old man told stories of his youth by the fireplace.",
    "They danced under the moonlight, their laughter echoing.",
    "The library was a treasure trove of forgotten knowledge.",
    "He watched the sunrise with a sense of hope.",
    "The car sped down the open road, the wind in their hair.",
    "She captured moments with her camera, freezing time.",
    "The city buzzed with life, even late at night.",
    "He carved wooden figures with intricate detail.",
    "The waves crashed against the rocky shore.",
    "Her voice was soft, yet carried a powerful message.",
    "The fire crackled, warming the cold night air.",
    "They walked hand in hand through the bustling market.",
    "The clouds parted, revealing a brilliant blue sky.",
    "She spent hours writing in her journal, lost in thought.",
    "The clock ticked loudly in the silent room.",
    "He built models of ships that sailed the seven seas.",
    "The garden was full of vibrant, colorful flowers.",
    "She stood on the bridge, watching the river flow beneath her.",
    "The chef prepared dishes with an artist's touch.",
    "He played the piano with passion and skill.",
    "The train moved steadily through the countryside.",
    "She found beauty in the simplest of things.",
    "The mountain trail wound through tall pine trees.",
    "They watched the sunset, their hearts full of peace.",
    "The old house creaked in the wind, full of memories.",
    "He studied the stars, fascinated by the night sky.",
    "The festival was alive with color, music, and joy.",
    "She practiced yoga by the lake, finding her balance.",
    "The clock struck midnight, marking the end of the year.",
    "He sketched portraits of people in the crowded café.",
    "The forest floor was carpeted with fallen leaves.",
    "She found solace in the quiet moments of the day.",
    "The book was filled with tales of adventure and mystery.",
]

In [None]:
# Index into `model_ids` of the checkpoint benchmarked in this run.
mid = 1

# Hugging Face tokenizer for the selected checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_ids[mid])

# Sampling settings shared by every generation request.
sampling_params = SamplingParams(temperature=0.8, top_k=100, max_tokens=1024)

# vLLM engine built with its default dtype / quantization handling.
# Overrides tried previously and left disabled: dtype='bfloat16' and
# quantization="awq_marlin" on LLM(...), or loading through llmcompressor's
# SparseAutoModelForCausalLM.from_pretrained(model_ids[mid], device_map="auto",
# torch_dtype="auto") instead of vLLM.
llm = LLM(model=model_ids[mid])

# Cell output: the dtype recorded in the engine's model config.
llm.llm_engine.model_config.dtype

In [None]:
def get_runtime_metrics(request_output):
    """Summarise latency and token counts for one finished generation request.

    Args:
        request_output: Object exposing ``metrics`` (carrying the timestamps
            ``first_scheduled_time``, ``first_token_time`` and
            ``finished_time``), ``prompt_token_ids`` and
            ``outputs[0].token_ids``.

    Returns:
        dict with the keys, in this order:
            ``time_to_first_token``: first_token_time - first_scheduled_time
            ``time_to_last_token``:  finished_time - first_scheduled_time
            ``input_tokens``:        number of prompt token ids
            ``output_tokens``:       number of token ids in the first output
            ``total_tokens``:        input_tokens + output_tokens
            ``toks_per_s``:          total_tokens / time_to_last_token

        A timing value is NaN when ``metrics`` or one of the timestamps it
        needs is ``None``. ``toks_per_s`` is NaN whenever the elapsed time is
        unavailable or not strictly positive, so one degenerate request is
        recorded as missing data instead of aborting the whole benchmark with
        a ZeroDivisionError / TypeError.
    """
    nan = float('nan')

    def _elapsed(end, start):
        # A missing timestamp makes the interval unknown rather than an error.
        if end is None or start is None:
            return nan
        return end - start

    # `metrics` itself may be absent; getattr on None falls through to None.
    timing = getattr(request_output, 'metrics', None)
    scheduled_at = getattr(timing, 'first_scheduled_time', None)
    first_token_at = getattr(timing, 'first_token_time', None)
    finished_at = getattr(timing, 'finished_time', None)

    metrics = {}
    metrics['time_to_first_token'] = _elapsed(first_token_at, scheduled_at)
    metrics['time_to_last_token'] = _elapsed(finished_at, scheduled_at)
    # `or ()` tolerates a prompt whose token ids were not populated.
    metrics['input_tokens'] = len(request_output.prompt_token_ids or ())
    metrics['output_tokens'] = len(request_output.outputs[0].token_ids)
    metrics['total_tokens'] = metrics['input_tokens'] + metrics['output_tokens']

    duration = metrics['time_to_last_token']
    # NaN compares False against 0, so unknown durations also land on NaN here.
    metrics['toks_per_s'] = metrics['total_tokens'] / duration if duration > 0 else nan

    return metrics

# One metrics record per request, in the order the requests were issued.
history = []

# Each iteration sends a prompt that is one sentence longer than the previous
# one, so the recorded requests span a range of input lengths.
for n_sentences in range(1, len(messages) + 1):
    story_so_far = ' '.join(messages[:n_sentences])

    chat = [
        {"role": "system", "content": "You always complete the story in 200 words."},
        {"role": "user", "content": story_so_far},
    ]
    prompt_text = tokenizer.apply_chat_template(chat, add_generation_prompt=True, tokenize=False)

    # NOTE(review): the rendered template may already start with the BOS token
    # and generate() may prepend another one when it tokenizes the string --
    # confirm against the vLLM docs; if so, input_tokens is off by one.
    request_outputs = llm.generate(prompt_text, sampling_params=sampling_params)
    history.append(get_runtime_metrics(request_outputs[0]))

# Tabulate the per-request records, one row per prompt length.
metrics = pd.DataFrame.from_records(history)

In [None]:
# Cell output: the per-request metrics table (one row per generated request).
metrics

In [None]:
# Save the table next to the notebook; '/' in the repo id is replaced with
# '__' so the id can be used as a flat file name.
metrics.to_csv(f"{model_ids[mid].replace('/', '__')}.csv")

---

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Load each model's saved benchmark table, ordered by total token count so the
# curves are drawn left to right. Append .iloc[:10] to inspect a short prefix.
unquant, awq, w8a8 = (
    pd.read_csv(csv_file).sort_values('total_tokens')
    for csv_file in (
        'meta-llama__Llama-3.2-1B-Instruct.csv',
        'ciCic__llama-3.2-1B-Instruct-AWQ.csv',
        'neuralmagic__Llama-3.2-1B-Instruct-quantized.w8a8.csv',
    )
)

In [None]:
# Throughput against request size, one curve per model variant.
fig, ax = plt.subplots()

for frame, label in (
    (unquant, 'Unquantized'),
    (awq, 'AWQ'),
    (w8a8, 'W8A8'),
):
    ax.plot(frame['total_tokens'], frame['toks_per_s'], label=label)

ax.set_xlabel('Total Tokens')
ax.set_ylabel('Tokens per s')
ax.legend()
plt.show()