-
Notifications
You must be signed in to change notification settings - Fork 0
/
llama-test.py
31 lines (23 loc) · 1.2 KB
/
llama-test.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
# Smoke test: load a quantized Llama-2 chat model through llama-cpp-python
# and print its answer to a single prompt.

model_name_or_path = "TheBloke/Llama-2-7B-Chat-GGUF"  # Hugging Face repo id of the GGUF model
model_basename = "llama-2-7b-chat.Q4_0.gguf"  # GGUF file to load from that repo
# Commit hash of the repo snapshot to load, so every run uses the same file.
model_revision = "191239b3e26b2882fb562ffccdd1cf0f65402adb"

# Resolve the model file through the Hugging Face cache rather than a
# user-specific absolute path, so the script is not tied to one machine.
# hf_hub_download returns the local path of the file, fetching it first
# when it is not in the cache yet.
model_path = hf_hub_download(
    repo_id=model_name_or_path,
    filename=model_basename,
    revision=model_revision,
)
print(model_path)

lcpp_llm = Llama(
    model_path=model_path,
    n_threads=4,  # CPU cores
    n_batch=256,  # Should be between 1 and n_ctx, consider the amount of VRAM in your GPU.
    n_gpu_layers=2,  # Change this value based on your model and your GPU VRAM pool.
)

prompt = "Summarize the 'Attention is all you need' research paper in 250 words."
prompt_template = f'''SYSTEM: You are a helpful, respectful and honest assistant. Always answer as helpfully.
USER: {prompt}
ASSISTANT:
'''

# echo=False asks for the completion only, so the prompt does not have to be
# sliced off the front of the returned text before printing.
response = lcpp_llm(
    prompt=prompt_template,
    max_tokens=256,
    temperature=0.5,
    top_p=0.95,
    repeat_penalty=1.2,
    top_k=150,
    echo=False,
)
print("Answer:\n", response["choices"][0]["text"])