In [None]:
from llama_cpp import Llama

In [None]:
# Variables
MODEL_ID = "llama-7b"
QUANTIZATION_METHODS = ["q4_k_m", "q5_k_m"] #"q4_k_m" or "q5"
# Constants
MODEL_NAME = MODEL_ID.split('/')[-1]
GGML_VERSION = "gguf"

# Convert to fp16: Converting a model to use float16 instead of float32 
# can decrease the model size (up to half) and improve performance on some GPUs. 
fp16 = f"{MODEL_NAME}/{MODEL_NAME.lower()}.{GGML_VERSION}.fp16.bin"
!python /Users/astridz/Documents/Development/llama.cpp/convert.py {MODEL_NAME} --outtype f16 --outfile {fp16}

# Quantize the model for each method in the QUANTIZATION_METHODS list
for method in QUANTIZATION_METHODS:
    qtype = f"{MODEL_NAME}/{MODEL_NAME.lower()}.{GGML_VERSION}.{method}.bin"
    !./llama.cpp/quantize {fp16} {qtype} {method}

Our two quantized models are now ready for inference. 

## Run inference

In [None]:
import os

Let’s use llama.cpp to run them efficiently. We’ll pass the `-ngl 35` parameter, which offloads 35 of the model’s layers to the GPU.

In [None]:
model_list = [file for file in os.listdir(MODEL_NAME) if GGML_VERSION in file]

prompt = input("Enter your prompt: ")
chosen_method = input("Please specify the quantization method to run the model (options: " + ", ".join(model_list) + "): ")

#input a prompt
if chosen_method not in model_list:
    print("Invalid method chosen!")
else:
    qtype = f"{MODEL_NAME}/{MODEL_NAME.lower()}.{GGML_VERSION}.{method}.bin"
    !./llama.cpp/main -m {qtype} -n 128 --color -ngl 35 -p "{prompt}"

## Langchain Framework

In [1]:
from langchain import PromptTemplate, FewShotPromptTemplate
from langchain.chains import LLMChain
from langchain.llms import LlamaCpp

Example with multiple input variables

In [None]:
# Load the fp16 model file through LangChain's llama.cpp wrapper.
llm = LlamaCpp(model_path=fp16)

# Placeholder names must match `input_variables` exactly (they are
# case-sensitive), otherwise the template cannot be filled in.
template = """
Q: Tell me a recipe based on {ingredient} by using one of following methods: {utensil}. \n

A: 

"""

prompt = PromptTemplate(input_variables=["ingredient", "utensil"], template=template)

llm_chain = LLMChain(prompt=prompt, llm=llm)
# A prompt with several input variables needs each one passed by keyword.
# The values below are example inputs; change them to try other recipes.
llm_chain.run(ingredient="chicken", utensil="stir fry, air fry")

In [None]:
# CHECKING
# A cell only renders its final bare expression, so show all three values
# explicitly: the prompt object, its declared variables, and the raw template.
display(prompt, prompt.input_variables, prompt.template)