## Step 1: Install all the required packages

In [None]:
# GPU llama-cpp-python
!CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python --force-reinstall --upgrade --no-cache-dir --verbose

# For download the models
!pip install huggingface_hub

## Step 2: Import all the required libraries

In [None]:
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

## Step 3: Download the Models

In [None]:
# Hugging Face Hub coordinates of the quantized model: repository id and file name.
model_name_or_path = "TheBloke/Llama-2-13B-chat-GGML"
model_basename = "llama-2-13b-chat.ggmlv3.q5_1.bin"  # the weights ship as a .bin file

# Fetch the file from the Hub and keep whatever hf_hub_download returns as the
# model location for the following steps.
model_path = hf_hub_download(
    filename=model_basename,
    repo_id=model_name_or_path,
)

## Step 4: Loading the model

In [None]:
# Optional override: set this to the path of a GGML v3 file that is already on disk
# to load it instead of the file downloaded in Step 3, for example
#   r"D:/llama2_quantized_models/7B_chat/llama-2-7b-chat.ggmlv3.q8_0.bin"
# Left as None, the notebook uses the downloaded model and therefore runs on any
# machine, not only one where that local path exists.
local_model_path = None

# GPU
# Clear the name first; presumably this lets a re-run of the cell release the
# previously loaded model before a new one is created (confirm).
lcpp_llm = None
lcpp_llm = Llama(
    model_path=local_model_path or model_path,  # override if given, else the Step 3 download
    n_threads=2,  # CPU cores
    n_batch=512,  # Should be between 1 and n_ctx, consider the amount of VRAM in your GPU.
    n_gpu_layers=32,  # Change this value based on your model and your GPU VRAM pool.
)

In [None]:
# Show the n_gpu_layers value stored in the loaded model's params
# (presumably the number of layers offloaded to the GPU; confirm against the library docs).
lcpp_llm.params.n_gpu_layers

## Step 5: Create a Prompt Template

In [None]:
# The user's request; it is substituted into the USER turn of the template below.
prompt = "Write a linear regression in python and plot it using seaborn"

# SYSTEM / USER / ASSISTANT turns separated by blank lines. The template ends right
# after the "ASSISTANT:" line (plus its newline), leaving the reply to be generated.
prompt_template = (
    "SYSTEM: You are a helpful, respectful and honest assistant. Always answer as helpfully.\n"
    "\n"
    f"USER: {prompt}\n"
    "\n"
    "ASSISTANT:\n"
)

## Step 6: Generating the Response

In [None]:
# Run a single completion on the formatted prompt with the sampling settings below.
response = lcpp_llm(
    prompt=prompt_template,
    max_tokens=256,
    temperature=0.5,
    top_p=0.95,
    top_k=150,
    repeat_penalty=1.2,
    echo=True,
)
    

In [None]:
# Dump the whole response object returned by the completion call.
print(response)

In [None]:
# Print only the "text" field of the first entry in the response's "choices" list.
print(response["choices"][0]["text"])