In [None]:
!pip install -q transformers accelerate bitsandbytes
!pip install -q einops

In [None]:
import os

from huggingface_hub import login

# Take the access token from the HF_TOKEN environment variable instead of
# writing it into the notebook, where it would be saved and shared with the
# file. When the variable is unset or empty, None is passed and login() falls
# back to prompting for the token interactively.
login(token=os.environ.get("HF_TOKEN") or None)

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Checkpoint used by every benchmark cell below.
# NOTE(review): this is the base (non-chat) Llama-2 checkpoint, yet later cells
# call tokenizer.apply_chat_template — confirm this tokenizer ships a chat
# template with the installed transformers version.
model_id = "meta-llama/Llama-2-7b-hf"

# 4-bit NF4 quantization with double quantization enabled; the compute dtype
# is set to bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

tokenizer = AutoTokenizer.from_pretrained(model_id)

# device_map="auto" delegates weight placement to the library.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    quantization_config=bnb_config,
)

In [None]:
import time

# Single-request greedy decoding benchmark: measures time to first token
# (TTFT), decode throughput and an estimated GPU cost for one prompt.

# Reuse EOS as the padding token so tokenizer.pad_token_id is defined.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

messages = [
    {"role": "user", "content": "What is the capital of France? Answer with only the city name."}
]

prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)

# The rendered chat template already contains the special tokens it needs, so
# tokenizing it with the default add_special_tokens=True would prepend a second
# BOS. Inputs follow the model's own device instead of a hard-coded "cuda".
input_ids = tokenizer.encode(
    prompt, return_tensors="pt", add_special_tokens=False
).to(model.device)

max_new_tokens = 100
current_input_ids = input_ids
attention_mask = torch.ones_like(input_ids, dtype=torch.long)

past_key_values = None
start_time = None
first_token_time = None
output_ids = input_ids.clone()
generated_tokens = 0
# Forward passes executed after the first one, i.e. the passes that actually
# fall inside the [first_token_time, end_time] window.
decode_steps = 0

# Hourly price of the GPU in USD, used for the cost estimates below.
GPU_COST_PER_HOUR = 2.93

model.eval()

with torch.no_grad():
    for i in range(max_new_tokens):

        if i == 0:
            # Drain pending GPU work so the timer starts from an idle device.
            torch.cuda.synchronize()
            start_time = time.perf_counter()

        outputs = model(
            current_input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            use_cache=True
        )

        # Greedy decoding: take the arg-max over the last position's logits.
        logits = outputs.logits[:, -1, :]
        next_token_id = torch.argmax(logits, dim=-1).unsqueeze(-1)

        if i == 0:
            torch.cuda.synchronize()
            first_token_time = time.perf_counter()
            ttft = first_token_time - start_time
            print(f"**Time to First Token (TTFT): {ttft:.4f} seconds**")
        else:
            decode_steps += 1

        # .item() also forces a host sync, so every step is fully executed
        # before the next one is launched.
        if next_token_id.item() == tokenizer.eos_token_id:
            break

        output_ids = torch.cat([output_ids, next_token_id], dim=-1)

        # With the KV cache populated only the newest token is fed back in.
        current_input_ids = next_token_id

        new_attention_mask = torch.ones(
            (1, 1), dtype=torch.long, device=attention_mask.device
        )
        attention_mask = torch.cat([attention_mask, new_attention_mask], dim=1)

        past_key_values = outputs.past_key_values
        generated_tokens += 1

torch.cuda.synchronize()
end_time = time.perf_counter()
total_gen_time = end_time - first_token_time

# Steady-state throughput counts only the passes that ran after the first
# token; the first token was produced during prefill and is covered by TTFT.
throughput = (
    decode_steps / total_gen_time
    if decode_steps > 0 and total_gen_time > 0
    else 0
)
overall_throughput = generated_tokens / (end_time - start_time)

cost_per_second = GPU_COST_PER_HOUR / 3600
cost_per_token = cost_per_second / throughput if throughput > 0 else float("inf")
# The GPU is occupied for the whole request (prefill + decode). Pricing the
# wall time directly also avoids inf * 0 = nan when no token was generated.
sequence_cost = cost_per_second * (end_time - start_time)

response = tokenizer.decode(output_ids[0], skip_special_tokens=True)

print("\n--- Model Response ---")
print(response)
print(f"Total tokens generated: {generated_tokens}")
print(f"Total generation time: {total_gen_time:.4f}s")
print(f"Overall throughput (incl TTFT): {overall_throughput:.2f} tokens/s")
print(f"Steady-state throughput: {throughput:.2f} tokens/s")
print(f"Cost per token: ${cost_per_token:.8f}")
print(f"Total sequence cost: ${sequence_cost:.8f}")

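LATENCY-BOUND TESTING

Single-request greedy decoding with a forced output length: each run generates exactly `max_new_tokens` tokens and reports time to first token (TTFT), generation time, and throughput.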

In [None]:
import time
import torch
import csv
from datetime import datetime

def latency_bound_test(model, tokenizer, message, max_new_tokens=512):
    """Time single-sequence greedy decoding for exactly ``max_new_tokens`` steps.

    The chat template is applied to ``message``, the prompt is run through the
    model once (prefill), and tokens are then decoded one at a time using the
    KV cache. Whenever the model picks EOS/PAD/BOS, that token is replaced with
    a filler token ("a") so generation never stops early and every run produces
    the same number of tokens.

    Args:
        model: Causal LM callable as ``model(input_ids, past_key_values=...,
            attention_mask=..., use_cache=True)`` and exposing ``.device`` and
            ``.eval()``.
        tokenizer: Tokenizer providing ``apply_chat_template``, ``encode``,
            ``decode`` and the ``eos/pad/bos_token_id`` attributes.
        message: Chat messages (list of ``{"role", "content"}`` dicts) passed
            to ``apply_chat_template``.
        max_new_tokens: Number of tokens to generate; must be >= 1.

    Returns:
        dict with keys:
            ``generated_tokens``: number of tokens produced (== max_new_tokens).
            ``ttft``: seconds from start until the first token is available.
            ``gen_time``: seconds from the first token to the last token.
            ``throughput``: decode tokens per second, i.e. the tokens produced
                after the first one divided by ``gen_time`` (0.0 when there
                are none).
            ``output``: decoded prompt + generated text.

    Raises:
        ValueError: if ``max_new_tokens`` is smaller than 1.
    """
    if max_new_tokens < 1:
        raise ValueError("max_new_tokens must be at least 1")

    def _sync():
        # Wait for queued GPU work so wall-clock timestamps are meaningful;
        # a no-op when CUDA is not available.
        if torch.cuda.is_available():
            torch.cuda.synchronize()

    prompt = tokenizer.apply_chat_template(
        message,
        tokenize=False,
        add_generation_prompt=True
    )

    device = model.device
    # The rendered template already contains its special tokens; encoding with
    # the default add_special_tokens=True would prepend a duplicate BOS.
    input_ids = tokenizer.encode(
        prompt, return_tensors="pt", add_special_tokens=False
    ).to(device)
    attention_mask = torch.ones_like(input_ids, dtype=torch.long)

    # Tokens that would normally end generation; ids the tokenizer does not
    # define (None) are left out.
    stop_tokens = {
        token_id
        for token_id in (
            tokenizer.eos_token_id,
            tokenizer.pad_token_id,
            tokenizer.bos_token_id,
        )
        if token_id is not None
    }

    # Loop-invariant tensors, built once instead of on every step.
    filler_token = torch.tensor(
        [[tokenizer.encode("a", add_special_tokens=False)[0]]],
        dtype=torch.long,
        device=device,
    )
    mask_step = torch.ones((1, 1), dtype=torch.long, device=device)

    model.eval()

    current_input_ids = input_ids
    past_key_values = None
    new_tokens = []
    ttft = None
    first_token_time = None

    with torch.no_grad():
        _sync()
        start_time = time.perf_counter()

        for step in range(max_new_tokens):
            outputs = model(
                current_input_ids,
                past_key_values=past_key_values,
                attention_mask=attention_mask,
                use_cache=True,
            )

            # Greedy pick from the logits of the last position.
            next_token_id = outputs.logits[:, -1, :].argmax(dim=-1, keepdim=True)

            if step == 0:
                _sync()
                first_token_time = time.perf_counter()
                ttft = first_token_time - start_time

            # Force a fixed-length run: swap any stop token for the filler.
            if next_token_id.item() in stop_tokens:
                next_token_id = filler_token

            new_tokens.append(next_token_id)

            # With the KV cache populated only the newest token is fed back.
            current_input_ids = next_token_id
            attention_mask = torch.cat([attention_mask, mask_step], dim=1)
            past_key_values = outputs.past_key_values

    _sync()
    end_time = time.perf_counter()

    generated_tokens = len(new_tokens)
    output_ids = torch.cat([input_ids, *new_tokens], dim=-1)

    total_gen_time = end_time - first_token_time
    # Only the tokens after the first are produced inside the gen_time window;
    # the first token's latency is already reported as TTFT.
    decode_tokens = generated_tokens - 1
    if decode_tokens > 0 and total_gen_time > 0:
        throughput = decode_tokens / total_gen_time
    else:
        throughput = 0.0

    return {
        "generated_tokens": generated_tokens,
        "ttft": ttft,
        "gen_time": total_gen_time,
        "throughput": throughput,
        "output": tokenizer.decode(output_ids[0], skip_special_tokens=True),
    }

In [None]:
# Output lengths (number of generated tokens) to sweep over.
sizes = [256, 512, 1024]

# Benchmark prompts: each entry pairs a short label with the chat messages
# handed to latency_bound_test.
messages = [
    {"name": "simple_qa", "messages": [
        {"role": "user", "content": "What is the capital of France? Answer with only the city name."}
    ]},
    {"name": "reasoning", "messages": [
        {"role": "user", "content": "Let's think step-by-step. If John is taller than Mark, and Mark is shorter than Sue, is John definitely taller than Sue? Answer 'Yes', 'No', or 'Cannot determine'."}
    ]},
    {"name": "sentiment_analysis", "messages": [
        {"role": "user", "content": "Classify the sentiment of the text as 'Positive', 'Negative', or 'Neutral'. Text: The service was quick and the food was delicious. Sentiment: Positive. Text: The package arrived late and the box was damaged. Sentiment: Negative. Text: The meeting ended on time. Sentiment: Neutral. Text: I finished the book but found the ending disappointing.Sentiment: [FILL IN HERE]"}
    ]},
    {"name": "summarization", "messages": [
        {"role": "user", "content": "You are an expert summarizer. Your goal is to write a single-paragraph, abstractive summary of the provided text, focusing on the main argument and conclusion. The summary must be brief, no more than 75 words. Use this article: https://en.wikipedia.org/wiki/Graphics_processing_unit"}
    ]},
]

# Run every prompt at every output length and print the measurements.
for seq_len in sizes:
    for message in messages:
        # Single quotes inside the f-string: reusing the enclosing double
        # quotes is a SyntaxError on Python versions before 3.12.
        print(f"Testing {seq_len} Tokens On Prompt {message['name']}")
        r = latency_bound_test(model, tokenizer, message["messages"], max_new_tokens=seq_len)

        print(f"Generated: {r['generated_tokens']} Tokens")
        print(f"TTFT: {r['ttft']:.4f}s")
        print(f"Generation Time: {r['gen_time']:.4f}s")
        print(f"Throughput: {r['throughput']:.2f} tokens/sec")
        print(f"Output: {r['output']}")