In [None]:
!pip install -q transformers accelerate bitsandbytes
!pip install -q einops

In [None]:
import os

from huggingface_hub import login

# Do not paste an access token into the notebook source: it ends up in version
# control and in shared copies. Read it from the HF_TOKEN environment variable
# instead; when the variable is unset, token=None makes login() ask for the
# token interactively.
login(token=os.environ.get("HF_TOKEN"))

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Hub identifier shared by the model and tokenizer loads below.
model_id = "meta-llama/Llama-2-7b-hf"

# Quantization settings: 4-bit NF4 weights with double quantization enabled,
# and bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

# device_map="auto" leaves weight placement to the library.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    quantization_config=bnb_config,
)

# Tokenizer that matches the checkpoint loaded above.
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [None]:
import time

# Greedy, token-by-token decoding with a KV cache, reporting the time to first
# token (TTFT). Relies on `model` and `tokenizer` from the loading cell.

# Use EOS as the pad token and pad on the left.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

prompt = "The secret to happiness is"
max_new_tokens = 100

# The model was loaded with device_map="auto", so take the input device from
# the model instead of hard-coding 'cuda'.
device = model.device
input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)

# A single, unpadded prompt: every position holds a real token. Deriving the
# mask from `input_ids != pad_token_id` would be wrong here because pad == EOS,
# so a genuine EOS token inside the prompt would be masked out as padding.
attention_mask = torch.ones_like(input_ids)

current_input_ids = input_ids
past_key_values = None
start_time = None
first_token_time = None
output_ids = input_ids.clone()
generated_tokens = 0

model.eval()

with torch.no_grad():
    for i in range(max_new_tokens):
        if i == 0:
            # Drain pending GPU work so the timer starts from an idle device.
            torch.cuda.synchronize()
            start_time = time.time()

        # Step 0 feeds the whole prompt; later steps feed only the newest
        # token and reuse the cached keys/values from previous steps.
        outputs = model(
            current_input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            use_cache=True
        )

        # Greedy choice: highest-scoring token at the last position.
        logits = outputs.logits[:, -1, :]
        next_token_id = torch.argmax(logits, dim=-1).unsqueeze(-1)

        if i == 0:
            # Wait for the forward pass and argmax to finish before reading
            # the clock, otherwise only the kernel launch would be timed.
            torch.cuda.synchronize()
            first_token_time = time.time()
            ttft = first_token_time - start_time
            print(f"**Time to First Token (TTFT): {ttft:.4f} seconds**")

        # Stop at end-of-sequence; the EOS token is neither appended nor counted.
        if next_token_id.item() == tokenizer.eos_token_id:
            break

        output_ids = torch.cat([output_ids, next_token_id], dim=-1)
        current_input_ids = next_token_id

        # The mask must span cached positions plus the new token; new_ones
        # inherits dtype and device from the existing mask.
        attention_mask = torch.cat(
            [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))],
            dim=1,
        )

        past_key_values = outputs.past_key_values
        generated_tokens += 1

response = tokenizer.decode(output_ids[0], skip_special_tokens=True)
print("\n--- Model Response ---")
print(response)
print(f"Total tokens generated (excluding prompt): {generated_tokens}")