# Getting Started

## Using Ctransformers
In general `ggml` and `gptq` are not supported by HuggingFace Transformers. However, this work using Ctransformers

In [1]:
from ctransformers import AutoModelForCausalLM

In [2]:
model_path = r"D:/llama2_quantized_models/7B_q5/llama-2-7b-chat.ggmlv3.q5_K_M.bin"
model_name = "llama-2-7b-chat.ggmlv3.q5_K_M.bin"

# check ctransformers doc for more configs
config = {'max_new_tokens': 256, 'repetition_penalty': 1.1, 
          'temperature': 0.1, 'stream': True}

llm = AutoModelForCausalLM.from_pretrained(model_path_or_repo_id=model_path, model_file=model_name, gpu_layers=35, model_type='llama', **config)

In [3]:
prompt="""Write a poem to help me remember the first 10 elements on the periodic table, giving each
element its own line."""

In [4]:
# Tokenize the prompt
tokens = llm.tokenize(prompt)
print(tokens)

[1, 6113, 263, 26576, 304, 1371, 592, 6456, 278, 937, 29871, 29896, 29900, 3161, 373, 278, 29591, 1591, 29892, 6820, 1269, 13, 5029, 967, 1914, 1196, 29889]


In [None]:
# Pipeline Execution
llm(prompt, stream=False)


In [None]:
# Stream execution for Generation + Stats
import time
start = time.time()

NUM_TOKENS=0
print('-'*4+'Start Generation'+'-'*4)
for token in llm.generate(tokens):
    print(llm.detokenize(token), end='', flush=True)
    NUM_TOKENS+=1
time_generate = time.time() - start
print('\n')
print('-'*4+'End Generation'+'-'*4)
print(f'Num of generated tokens: {NUM_TOKENS}')
print(f'Time for complete generation: {time_generate}s')
print(f'Tokens per secound: {NUM_TOKENS/time_generate}')
print(f'Time per token: {(time_generate/NUM_TOKENS)*1000}ms')

## Using Llama_cpp

In [14]:
from llama_cpp import Llama

In [19]:
model_path = r"D:/llama2_quantized_models/7B_q5/llama-2-7b-chat.ggmlv3.q5_K_M.bin"
model_name = "llama-2-7b-chat.ggmlv3.q5_K_M.bin"

llm = Llama(model_path=model_path,
            n_gpu_layers=32, 
            n_ctx=8192, 
            n_batch=512)

AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | VSX = 0 | 


In [20]:
template = """
    [INST] <<SYS>>
    You are a helpful, respectful and honest assistant. 
    Always answer as helpfully as possible, while being safe.  
    Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. 
    Please ensure that your responses are socially unbiased and positive in nature.
    If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. 
    If you don't know the answer to a question, please don't share false information.
    <</SYS>>
    {INSERT_PROMPT_HERE} [/INST]
    """

In [21]:
# create prompt
prompt = 'Write me a resignation letter'

In [22]:
output = llm(prompt, 
                 max_tokens=-1, 
                 echo=False, 
                 temperature=0.2, 
                 top_p=0.1)

In [34]:
print(output['choices'][0]['text'])

 due to bullying at work

I am writing to inform you of my decision to resign from my position as [position] at [company name], effective [date of last day of work]. The reason for my resignation is the persistent and unacceptable behavior of [name of bully or supervisor], which has created a hostile work environment for me.
Despite my efforts to address this issue through the company's reporting mechanisms, including speaking with [name of HR representative or supervisor], I have not seen any meaningful action taken to address the problem. As a result, I feel that it is necessary to take this step in order to protect my own well-being and maintain my dignity as an employee.
I want to make it clear that I have enjoyed my time at [company name] and have appreciated the opportunities for growth and development that I have had here. However, I cannot continue to work in an environment where I am subjected to bullying and harassment on a regular basis.
I will do everything possible to ensu