<a href="https://colab.research.google.com/github/anushadudi/inference_latency_optimization/blob/main/hf_kv_torch_compile_bench.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers datasets evaluate accelerate torch

Collecting datasets
  Downloading datasets-3.3.1-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Co

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer,AutoModelForSeq2SeqLM
import torch
from huggingface_hub import login
import time
import pandas as pd
from google.colab import userdata

# Benchmark prompts, grouped by topic (coding, literature, math).
# `run_benchmark` iterates over this list in order and sends each prompt to `generate`.
questions = [
    # Coding questions
    "Implement a Python function to compute the Fibonacci numbers.",
    "Write a Rust function that performs binary exponentiation.",
    "What are the differences between Javascript and Python?",
    # Literature
    "Write a story in the style of James Joyce about a trip to the Australian outback in 2083, to see robots in the beautiful desert.",
    "Who does Harry turn into a balloon?",
    "Write a tale about a time-traveling historian who's determined to witness the most significant events in human history.",
    # Math
    "What is the product of 9 and 8?",
    "If a train travels 120 kilometers in 2 hours, what is its average speed?",
    "Think through this step by step. If the sequence a_n is defined by a_1 = 3, a_2 = 5, and a_n = a_(n-1) + a_(n-2) for n > 2, find a_6.",
]

def apply_kvcache_torchcompile(model):
  """Switch `model` to a static KV cache and a compiled forward pass (in place).

  Args:
    model: object exposing a `forward` callable and a `generation_config`
      with a `cache_implementation` attribute.

  Returns:
    None. `model.forward` and `model.generation_config` are mutated.
  """
  compile_options = {"mode": "reduce-overhead", "fullgraph": True}

  # Ask `generate` to use the static cache implementation.
  model.generation_config.cache_implementation = "static"

  # Compile only the forward method: per the original author's note, calling
  # `torch.compile(model, ...)` on a transformers `PreTrainedModel` runs but does
  # not actually compile, so later `generate` calls would see no speedup.
  model.forward = torch.compile(model.forward, **compile_options)

def initiateModel(device):
  """Load the Llama 3.1 8B Instruct checkpoint and prepare it for benchmarking.

  Logs in to the Hugging Face Hub with the `HF_TOKEN` secret read from Colab
  `userdata`, loads the weights in bfloat16, moves the model to `device`, and
  applies the static-KV-cache + `torch.compile` setup.

  Args:
    device: device passed to `model.to(...)`.

  Returns:
    The loaded model.
  """
  checkpoint = "meta-llama/Meta-Llama-3.1-8B-Instruct"

  # Authenticate before attempting the download.
  hf_token = userdata.get('HF_TOKEN')
  login(token=hf_token)

  llm = AutoModelForCausalLM.from_pretrained(checkpoint, torch_dtype=torch.bfloat16)
  llm.to(device)
  apply_kvcache_torchcompile(llm)
  return llm

def initiateTokenizer(model_id="meta-llama/Meta-Llama-3.1-8B-Instruct"):
  """Load the tokenizer for `model_id` and make sure it has a usable pad token.

  The pad token is set to the existing EOS token rather than adding a new
  `[PAD]` entry: a newly added token gets an id beyond the model's embedding
  matrix unless the model embeddings are resized, and `generate` in this
  notebook already pads with `eos_token_id`.

  Args:
    model_id: Hugging Face Hub id of the tokenizer to load. Defaults to the
      Llama 3.1 8B Instruct checkpoint used by `initiateModel`.

  Returns:
    The loaded tokenizer, with `pad_token` defined.
  """
  tokenizer = AutoTokenizer.from_pretrained(model_id)
  # Only fill in a pad token when the checkpoint does not ship one.
  if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
  return tokenizer


In [None]:
import torch
from transformers import GenerationConfig
def generate(question, model, tokenizer, device):
  """Generate an answer for one prompt and report latency/throughput metrics.

  Args:
    question: prompt text passed to the tokenizer.
    model: object exposing `generation_config` and `generate(**inputs)`.
    tokenizer: callable tokenizer exposing `eos_token_id` and `batch_decode`.
    device: device the tokenized inputs are moved to.

  Returns:
    dict with keys:
      `question`: the prompt.
      `tok_count`: number of newly generated tokens (prompt tokens excluded).
      `time`: wall-clock seconds covering tokenization, generation and decoding.
      `answer`: decoded text of the full returned sequence.
      `tokens_per_second`: generated tokens divided by `time`.
      `ms_per_seq_output_token`: milliseconds per generated token
        (NaN if nothing was generated).
  """
  # Generation settings are written onto the model's own config so they apply
  # alongside whatever was configured there earlier (e.g. the cache implementation).
  gen_config = model.generation_config
  gen_config.max_new_tokens = 200
  gen_config.top_p = 0.01
  gen_config.temperature = 0.01
  gen_config.do_sample = True
  gen_config.pad_token_id = tokenizer.eos_token_id

  start = time.perf_counter()
  inputs = tokenizer(question, return_tensors="pt").to(device)
  result = model.generate(**inputs)
  decoded_output = tokenizer.batch_decode(result)[0]
  request_time = time.perf_counter() - start

  # The returned sequence starts with the prompt tokens; subtract them so the
  # metrics describe output tokens only and are not skewed by prompt length.
  prompt_tok_count = len(inputs["input_ids"][0])
  output_tok_count = max(len(result[0]) - prompt_tok_count, 0)

  response = {'question': question}
  response['tok_count'] = output_tok_count
  response['time'] = request_time
  response['answer'] = decoded_output
  response['tokens_per_second'] = output_tok_count / request_time
  response['ms_per_seq_output_token'] = (
      request_time * 1000 / output_tok_count if output_tok_count > 0 else float('nan')
  )
  return response

In [None]:
def run_benchmark(model, tokenizer, device):
    """Run every prompt in `questions` through `generate` and collect the metrics.

    The response for the first prompt is generated but not recorded
    (presumably a warm-up run so one-time setup cost is excluded -- confirm).

    Args:
        model: model forwarded to `generate`.
        tokenizer: tokenizer forwarded to `generate`.
        device: device forwarded to `generate`.

    Returns:
        pandas DataFrame with one row per recorded response; the same table is
        written to 'bench-hf-kv-torchcompile.csv' without the index column.
    """
    recorded = []

    for position, prompt in enumerate(questions, start=1):
        outcome = generate(question=prompt, model=model, tokenizer=tokenizer, device=device)
        if position < 2:
            # First call: run it, but leave it out of the results.
            continue
        recorded.append(outcome)

    frame = pd.DataFrame(recorded)
    frame.to_csv('bench-hf-kv-torchcompile.csv', index=False)
    return frame

In [None]:
# Device used for both the model weights and the tokenized inputs.
device = "cuda"
# Logs in to the Hub, loads the checkpoint onto `device`, and applies the
# static KV cache + torch.compile setup (see `initiateModel`).
model = initiateModel(device)

config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

In [None]:
# Load the tokenizer that matches the benchmarked checkpoint.
tokenizer = initiateTokenizer()

tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

In [None]:
# Run all prompts, write the CSV, and keep the metrics table for display.
result = run_benchmark(model, tokenizer, device)
# Bare last expression so the notebook renders the DataFrame.
result

Unnamed: 0,question,tok_count,time,answer,tokens_per_second,ms_per_seq_output_token
0,Write a Rust function that performs binary exp...,211,3.448814,<|begin_of_text|>Write a Rust function that pe...,61.180448,16.345091
1,What are the differences between Javascript an...,210,5.641272,<|begin_of_text|>What are the differences betw...,37.225645,26.863202
2,Write a story in the style of James Joyce abou...,231,5.64444,<|begin_of_text|>Write a story in the style of...,40.925229,24.434805
3,Who does Harry turn into a balloon?,209,5.67464,<|begin_of_text|>Who does Harry turn into a ba...,36.830528,27.151389
4,Write a tale about a time-traveling historian ...,224,5.639182,<|begin_of_text|>Write a tale about a time-tra...,39.72207,25.174921
5,What is the product of 9 and 8?,212,5.65589,<|begin_of_text|>What is the product of 9 and ...,37.483051,26.678725
6,"If a train travels 120 kilometers in 2 hours, ...",219,5.635089,<|begin_of_text|>If a train travels 120 kilome...,38.86363,25.730998
7,Think through this step by step. If the sequen...,258,3.451387,<|begin_of_text|>Think through this step by st...,74.752557,13.377469
