In [1]:
from llama_index.llms import LlamaCPP
from llama_index.llms.llama_utils import messages_to_prompt, completion_to_prompt
from llama_index import Document
from llama_index import VectorStoreIndex, StorageContext, load_index_from_storage
from llama_index import LLMPredictor, PromptHelper, ServiceContext

import pandas as pd

In [2]:
# Local Llama 2 7B chat model (Q4_0-quantised GGML weights) run through llama.cpp.
llm = LlamaCPP(
    model_path="../../../llama/llama-2-7b-chat/ggml-model-q4_0.bin",
    # Generation settings.
    temperature=0.1,
    max_new_tokens=256,
    # Llama 2 offers a 4096-token context window; stay slightly below it for headroom.
    context_window=3900,
    # Formatters that convert chat messages / plain completions into the Llama 2 prompt format.
    messages_to_prompt=messages_to_prompt,
    completion_to_prompt=completion_to_prompt,
    # Extra keyword arguments forwarded to __init__();
    # n_gpu_layers has to be at least 1 for the GPU to be used.
    model_kwargs={"n_gpu_layers": 1},
    # Extra keyword arguments forwarded to __call__() (none needed here).
    generate_kwargs={},
    verbose=True,
)

llama.cpp: loading model from ../../../llama/llama-2-7b-chat/ggml-model-q4_0.bin
llama_model_load_internal: format     = ggjt v3 (latest)
llama_model_load_internal: n_vocab    = 32000
llama_model_load_internal: n_ctx      = 3900
llama_model_load_internal: n_embd     = 4096
llama_model_load_internal: n_mult     = 256
llama_model_load_internal: n_head     = 32
llama_model_load_internal: n_head_kv  = 32
llama_model_load_internal: n_layer    = 32
llama_model_load_internal: n_rot      = 128
llama_model_load_internal: n_gqa      = 1
llama_model_load_internal: rnorm_eps  = 1.0e-06
llama_model_load_internal: n_ff       = 11008
llama_model_load_internal: freq_base  = 10000.0
llama_model_load_internal: freq_scale = 1
llama_model_load_internal: ftype      = 2 (mostly Q4_0)
llama_model_load_internal: model size = 7B
llama_model_load_internal: ggml ctx size =    0.08 MB
llama_model_load_internal: mem required  = 4160.96 MB (+ 1950.00 MB per state)
llama_new_context_with_model: kv self size  = 1950.

In [37]:
def load_documents(path="../../data/handbook_qa/data/qAndA.jsonl"):
    """Load question/answer records and wrap each one in a ``Document``.

    Parameters
    ----------
    path : str, optional
        Location of a JSON Lines file (one JSON object per line) that has a
        ``prompt`` column. Defaults to the handbook Q&A dataset, so existing
        ``load_documents()`` calls behave exactly as before.

    Returns
    -------
    list of Document
        One document per row of the file. The document text is ``str()`` of
        the row's values array, so all columns of a record end up in a
        single string.
    """
    qa_pairs = pd.read_json(path, lines=True)
    # A "?" is appended to every prompt.
    # NOTE(review): presumably the stored prompts lack the question mark - confirm against the data.
    qa_pairs["prompt"] = qa_pairs["prompt"] + "?"
    return [Document(text=str(row)) for row in qa_pairs.values]

In [38]:
# Build the list of Document objects from the Q&A file via load_documents().
documents = load_documents()

In [39]:
# Spot-check the text stored in one loaded document (index 10).
documents[10].text

'[\'What are some of the shortcodes in the handbook theme ?\'\n "Some of the shortcodes in the handbook theme include figure, gist, highlight, and KaTeX, as well as custom ones specific to the theme such as hints, expand, and tabs. Shortcodes are templates that can be parametrized and can be included in the content section of a Markdown file to enable more complex features than Markdown\'s simple syntax allows."]'

In [40]:
def create_service_context(
        model,
        max_input_size=1024,
        num_output=128,
        chunk_size_lim=256,
        overlap_ratio=0.1
    ):
    """Assemble a ``ServiceContext`` around ``model`` with a local embedding model.

    Parameters
    ----------
    model
        LLM object; it is wrapped in an ``LLMPredictor``.
    max_input_size, num_output, chunk_size_lim, overlap_ratio
        Prompt sizing settings handed to ``PromptHelper``. They are passed
        positionally in the order ``max_input_size, num_output,
        overlap_ratio, chunk_size_lim``, which differs from the order in
        this function's signature.
        NOTE(review): units are presumably tokens (and a fraction for
        ``overlap_ratio``) - confirm against the PromptHelper documentation.

    Returns
    -------
    ServiceContext
        Built by ``ServiceContext.from_defaults`` with ``embed_model="local"``.
    """
    return ServiceContext.from_defaults(
        llm_predictor=LLMPredictor(llm=model),
        prompt_helper=PromptHelper(
            max_input_size,
            num_output,
            overlap_ratio,
            chunk_size_lim,
        ),
        embed_model="local",
    )

In [41]:
# Build a service context around the local LLM, keeping the helper's default sizing arguments.
service_context = create_service_context(llm)
# Create a vector store index from the loaded documents using that service context.
index = VectorStoreIndex.from_documents(documents, service_context=service_context)

  from .autonotebook import tqdm as notebook_tqdm


In [42]:
# Obtain a query engine from the index, using as_query_engine()'s default settings.
query_engine = index.as_query_engine()

In [43]:
# Send a question through the query engine and print the text of the answer.
response = query_engine.query("Who are REG")
print(response.response)

  Based on the provided context information, I understand that REG refers to the Research and Engineering Group (REG) within the Turing Institute.
According to the context, the REG newsletter is sent out monthly around a week before the monthly team meeting, and the all-REG meeting is also held monthly where new joiners are welcomed and updates from around REG or the Turing are presented, followed by a discussion on a topic of interest for the wider team.
Therefore, based on the information provided, REG appears to be a group within the Turing Institute that focuses on research and engineering activities, and has regular meetings to share updates and discuss topics of interest.



llama_print_timings:        load time =  5081.84 ms
llama_print_timings:      sample time =   104.11 ms /   148 runs   (    0.70 ms per token,  1421.53 tokens per second)
llama_print_timings: prompt eval time =  5081.79 ms /   238 tokens (   21.35 ms per token,    46.83 tokens per second)
llama_print_timings:        eval time =  7902.76 ms /   147 runs   (   53.76 ms per token,    18.60 tokens per second)
llama_print_timings:       total time = 13274.83 ms


In [44]:
# Show the text of the first source node attached to the most recent response.
# NOTE(review): presumably this is the top-ranked retrieved document - confirm node ordering.
print(response.source_nodes[0].node.text)

['What is the REG newsletter and how often is it sent out?'
 'The REG newsletter is a monthly email containing short project updates and other news/updates from around the team and the institute. It is sent around a week before the monthly team meeting and comes to the Hut23 mailing list.']


In [48]:
# Ask a second question; `response` is rebound to the new result before printing its text.
response = query_engine.query("Where is the Alan Turing Institute based?")
print(response.response)

Llama.generate: prefix-match hit


  Thank you for asking! The Alan Turing Institute is based in London, UK.



llama_print_timings:        load time =  5081.84 ms
llama_print_timings:      sample time =    13.43 ms /    19 runs   (    0.71 ms per token,  1415.27 tokens per second)
llama_print_timings: prompt eval time =  4391.42 ms /   212 tokens (   20.71 ms per token,    48.28 tokens per second)
llama_print_timings:        eval time =   957.65 ms /    18 runs   (   53.20 ms per token,    18.80 tokens per second)
llama_print_timings:       total time =  5386.18 ms
