In [50]:
import pandas as pd

from langchain.embeddings.huggingface import HuggingFaceEmbeddings

from llama_cpp import Llama

from llama_index import (
    SimpleDirectoryReader,
    LangchainEmbedding,
    GPTListIndex,
    GPTVectorStoreIndex,
    PromptHelper,
    LLMPredictor,
    ServiceContext,
    Document
)
from llama_index.llms import LlamaCPP
from llama_index.llms.llama_utils import messages_to_prompt, completion_to_prompt

In [66]:
import llama_index
llama_index.__version__

'0.8.14'

Note: notebook assumes that in the reginald directory, there is a `gguf_models/` folder. Here we've downloaded the quantized 4-bit version of Llama2-13b-chat from [`TheBloke/Llama-2-13B-chat-GGML`](https://huggingface.co/TheBloke/Llama-2-13B-chat-GGML). 

Note that we're currently running a version of `llama-cpp-python` which no longer supports `ggmmlv3` model formats and has changed to `gguf`. We need to convert the above to `gguf` format using the `convert-llama-ggmlv3-to-gguf.py` script in [`llama.cpp`](https://github.com/ggerganov/llama.cpp).

## Quick example with llama-cpp-python

In [19]:
llama_2_13b_chat_path = "../../gguf_models/llama-2-13b-chat.gguf.q4_K_S.bin"

## Using metal acceleration

In [35]:
llm = Llama(model_path=llama_2_13b_chat_path, n_gpu_layers=1)

llama_model_loader: loaded meta data with 18 key-value pairs and 363 tensors from ../../gguf_models/llama-2-13b-chat.gguf.q4_K_S.bin (version GGUF V2 (latest))
llama_model_loader: - tensor    0:                token_embd.weight q4_K     [  5120, 32000,     1,     1 ]
llama_model_loader: - tensor    1:               output_norm.weight f32      [  5120,     1,     1,     1 ]
llama_model_loader: - tensor    2:                    output.weight q6_K     [  5120, 32000,     1,     1 ]
llama_model_loader: - tensor    3:              blk.0.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]
llama_model_loader: - tensor    4:              blk.0.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]
llama_model_loader: - tensor    5:              blk.0.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]
llama_model_loader: - tensor    6:         blk.0.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]
llama_model_loader: - tensor    7:           blk.0.attn_norm.weight f32      [  5120

llama_new_context_with_model: compute buffer total size =   91.47 MB
llama_new_context_with_model: max tensor size =   128.17 MB
ggml_metal_add_buffer: allocated 'data            ' buffer, size =  7024.61 MB, (14549.28 / 21845.34)
ggml_metal_add_buffer: allocated 'eval            ' buffer, size =     1.48 MB, (14550.77 / 21845.34)
ggml_metal_add_buffer: allocated 'kv              ' buffer, size =   402.00 MB, (14952.77 / 21845.34)
AVX = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 1 | ARM_FMA = 1 | F16C = 0 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 0 | SSSE3 = 0 | VSX = 0 | 
ggml_metal_add_buffer: allocated 'alloc           ' buffer, size =    90.02 MB, (15042.78 / 21845.34)


In [36]:
prompt_example = "Name all the planets in the solar system and state their distances to the sun"

In [37]:
output = llm(prompt_example,
             max_tokens=512,
             echo=True)


llama_print_timings:        load time =   552.96 ms
llama_print_timings:      sample time =   179.55 ms /   256 runs   (    0.70 ms per token,  1425.80 tokens per second)
llama_print_timings: prompt eval time =   552.93 ms /    17 tokens (   32.53 ms per token,    30.75 tokens per second)
llama_print_timings:        eval time = 14287.03 ms /   255 runs   (   56.03 ms per token,    17.85 tokens per second)
llama_print_timings:       total time = 15342.45 ms


In [38]:
print(output)

{'id': 'cmpl-618337c4-bc4d-4818-99d4-e87893cf21fb', 'object': 'text_completion', 'created': 1693518842, 'model': '../../gguf_models/llama-2-13b-chat.gguf.q4_K_S.bin', 'choices': [{'text': "Name all the planets in the solar system and state their distances to the sun.\n\nThere are eight planets in the solar system: Mercury, Venus, Earth, Mars, Jupiter, Saturn, Uranus, and Neptune. Here is a list of each planet along with its distance from the Sun (in astronomical units or AU):\n\n1. Mercury - 0.4 AU (very close to the Sun)\n2. Venus - 1.0 AU (just inside Earth's orbit)\n3. Earth - 1.0 AU (the distance from the Earth to the Sun is called an astronomical unit, or AU)\n4. Mars - 1.6 AU (about 1.5 times the distance from the Earth to the Sun)\n5. Jupiter - 5.2 AU (about 5 times the distance from the Earth to the Sun)\n6. Saturn - 9.5 AU (almost twice the distance from the Earth to the Sun)\n7. Uranus - 19.0 AU (about 4 times the distance from the Earth to the Sun)\n8. Neptune - 30.1 AU (mor

In [39]:
print(output["choices"][0]["text"])

Name all the planets in the solar system and state their distances to the sun.

There are eight planets in the solar system: Mercury, Venus, Earth, Mars, Jupiter, Saturn, Uranus, and Neptune. Here is a list of each planet along with its distance from the Sun (in astronomical units or AU):

1. Mercury - 0.4 AU (very close to the Sun)
2. Venus - 1.0 AU (just inside Earth's orbit)
3. Earth - 1.0 AU (the distance from the Earth to the Sun is called an astronomical unit, or AU)
4. Mars - 1.6 AU (about 1.5 times the distance from the Earth to the Sun)
5. Jupiter - 5.2 AU (about 5 times the distance from the Earth to the Sun)
6. Saturn - 9.5 AU (almost twice the distance from the Earth to the Sun)
7. Uranus - 19.0 AU (about 4 times the distance from the Earth to the Sun)
8. Neptune - 30.1 AU (more than 


## Using CPU

In [40]:
llm = Llama(model_path=llama_2_13b_chat_path)

llama_model_loader: loaded meta data with 18 key-value pairs and 363 tensors from ../../gguf_models/llama-2-13b-chat.gguf.q4_K_S.bin (version GGUF V2 (latest))
llama_model_loader: - tensor    0:                token_embd.weight q4_K     [  5120, 32000,     1,     1 ]
llama_model_loader: - tensor    1:               output_norm.weight f32      [  5120,     1,     1,     1 ]
llama_model_loader: - tensor    2:                    output.weight q6_K     [  5120, 32000,     1,     1 ]
llama_model_loader: - tensor    3:              blk.0.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]
llama_model_loader: - tensor    4:              blk.0.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]
llama_model_loader: - tensor    5:              blk.0.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]
llama_model_loader: - tensor    6:         blk.0.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]
llama_model_loader: - tensor    7:           blk.0.attn_norm.weight f32      [  5120

a_model_loader: - tensor  288:           blk.31.ffn_down.weight q4_K     [ 13824,  5120,     1,     1 ]
llama_model_loader: - tensor  289:             blk.31.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]
llama_model_loader: - tensor  290:           blk.31.ffn_norm.weight f32      [  5120,     1,     1,     1 ]
llama_model_loader: - tensor  291:             blk.32.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]
llama_model_loader: - tensor  292:             blk.32.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]
llama_model_loader: - tensor  293:             blk.32.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]
llama_model_loader: - tensor  294:        blk.32.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]
llama_model_loader: - tensor  295:          blk.32.attn_norm.weight f32      [  5120,     1,     1,     1 ]
llama_model_loader: - tensor  296:           blk.32.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]
llama_model_loader: - tensor  29

In [43]:
output = llm(prompt_example,
             max_tokens=512,
             echo=True)

Llama.generate: prefix-match hit

llama_print_timings:        load time =  1496.01 ms
llama_print_timings:      sample time =   182.77 ms /   256 runs   (    0.71 ms per token,  1400.66 tokens per second)
llama_print_timings: prompt eval time =     0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_print_timings:        eval time = 21947.42 ms /   256 runs   (   85.73 ms per token,    11.66 tokens per second)
llama_print_timings:       total time = 22482.00 ms


By inspection, we can see that the metal acceleration is faster as expected.

In [45]:
print(output["choices"][0]["text"])

Name all the planets in the solar system and state their distances to the sun.

There are eight planets in our solar system, which are: Mercury, Venus, Earth, Mars, Jupiter, Saturn, Uranus and Neptune. Here's a list of the planets in order from closest to farthest from the Sun:

1. Mercury - 57,909,227 km (0.38 AU)
2. Venus - 108,208,930 km (0.72 AU)
3. Earth - 149,597,890 km (1 AU)
4. Mars - 226,650,000 km (1.38 AU)
5. Jupiter - 778,299,000 km (5.2 AU)
6. Saturn - 1,426,666,400 km (9.5 AU)
7. Uranus - 2,870,972,200 km (19.2 AU)
8. Neptune - 


## Using Llama2 in `llama-index`

In [46]:
llm = LlamaCPP(
    model_path="../../gguf_models/llama-2-13b-chat.gguf.q4_K_S.bin",
    temperature=0.1,
    max_new_tokens=256,
    # llama2 has a context window of 4096 tokens,
    # but we set it lower to allow for some wiggle room
    context_window=3900,
    # kwargs to pass to __call__()
    generate_kwargs={},
    # kwargs to pass to __init__()
    # set to at least 1 to use GPU
    model_kwargs={"n_gpu_layers": 1},
    # transform inputs into Llama2 format
    messages_to_prompt=messages_to_prompt,
    completion_to_prompt=completion_to_prompt,
    verbose=True,
)

llama_model_loader: loaded meta data with 18 key-value pairs and 363 tensors from ../../gguf_models/llama-2-13b-chat.gguf.q4_K_S.bin (version GGUF V2 (latest))
llama_model_loader: - tensor    0:                token_embd.weight q4_K     [  5120, 32000,     1,     1 ]
llama_model_loader: - tensor    1:               output_norm.weight f32      [  5120,     1,     1,     1 ]
llama_model_loader: - tensor    2:                    output.weight q6_K     [  5120, 32000,     1,     1 ]
llama_model_loader: - tensor    3:              blk.0.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]
llama_model_loader: - tensor    4:              blk.0.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]
llama_model_loader: - tensor    5:              blk.0.attn_v.weight q4_K     [  5120,  5120,     1,     1 ]
llama_model_loader: - tensor    6:         blk.0.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]
llama_model_loader: - tensor    7:           blk.0.attn_norm.weight f32      [  5120

llama_new_context_with_model: compute buffer total size =  356.16 MB
llama_new_context_with_model: max tensor size =   128.17 MB
AVX = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 1 | ARM_FMA = 1 | F16C = 0 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 0 | SSSE3 = 0 | VSX = 0 | 
ggml_metal_free: deallocating


In [56]:
handbook = pd.read_csv("../../data/public/handbook-scraped.csv")
turing = pd.read_csv("../../data/public/turingacuk-no-boilerplate.csv")

text_list = list(handbook["body"].astype("str")) + list(turing["body"].astype("str"))
documents = [Document(text=t) for t in text_list]

In [60]:
hfemb = HuggingFaceEmbeddings()
embed_model = LangchainEmbedding(hfemb)

In [61]:
# set number of output tokens
num_output = 256
# set maximum input size
max_input_size = 2048
# set maximum chunk overlap
chunk_size_limit = 1024
chunk_overlap_ratio = 0.1

prompt_helper = PromptHelper(
    context_window=max_input_size,
    num_output=num_output,
    chunk_size_limit=chunk_size_limit,
    chunk_overlap_ratio=chunk_overlap_ratio,
)

In [62]:
 service_context = ServiceContext.from_defaults(
    llm_predictor=llm,
    embed_model=embed_model,
    prompt_helper=prompt_helper,
    chunk_size_limit=chunk_size_limit,
)

index = GPTVectorStoreIndex.from_documents(
    documents, service_context=service_context
)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/rchan/Library/Caches/llama_index...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [63]:
query_engine = index.as_query_engine()

In [65]:
response = query_engine.query("what should a new starter in REG do?")
print(response.response)

AttributeError: 'LlamaCPP' object has no attribute 'predict'

## Chat engine

In [None]:
chat_engine = index.as_chat_engine(chat_mode="react", verbose=True)

In [None]:
response = chat_engine.chat(
    "what should a new starter in REG do?"
)

In [None]:
print(response)

In [None]:
response = chat_engine.chat("What did I ask you before?")

In [None]:
print(response)

Reset chat engine...

In [None]:
chat_engine.reset()

In [None]:
response = chat_engine.chat("What did I ask you before?")

In [None]:
print(response)