# Load questions

In [1]:
# Read every prompt up front. The context manager closes the file handle as
# soon as the lines are loaded instead of leaving it open for the rest of the
# session. readlines() keeps each line's trailing newline.
with open("questions.txt", "r") as f:
    questions = f.readlines()
questions[:5]

['Can you create a short story based on a specific theme?\n',
 'Can you help me design a workout plan for weight loss?\n',
 'What are some creative date ideas?\n',
 'Can you suggest a list of books to read across different genres?\n',
 'How can I reduce my carbon footprint?\n']

In [None]:
from transformers import AutoTokenizer
import transformers
import torch
import time
from tqdm import tqdm

# Benchmark: run every question through a text-generation pipeline one at a
# time and report the total wall time and the resulting throughput.
model = "meta-llama/Llama-2-7b-chat-hf"

tokenizer = AutoTokenizer.from_pretrained(model)
pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    torch_dtype=torch.float16,
    device_map="cuda",
)

# perf_counter() is monotonic and high-resolution, so the measured interval
# cannot be skewed by system clock adjustments the way time.time() can.
# Model loading above is deliberately outside the timed region.
start = time.perf_counter()
for question in tqdm(questions):
    # Each question is passed exactly as read from the file (including its
    # trailing newline); only the last result is retained in `sequences`.
    sequences = pipeline(
        question,
        do_sample=True,
        top_k=10,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
        max_length=200,
    )
end = time.perf_counter()

inference_time = end - start
num_requests = len(questions)
# Guard the division so an empty question list (elapsed time may be 0) does
# not raise ZeroDivisionError.
throughput = num_requests / inference_time if inference_time > 0 else 0.0

print(f"Total inference time: {round(inference_time, 4)}s")
print(f"Total sample: {num_requests}")
# Keep fractional digits: rounding to an integer would report any rate below
# 0.5 sample/s as 0.
print(f"Result: {round(throughput, 4)} sample/s")
print('---------------------------------------------------------')