In [1]:
import pandas as pd

from langchain.embeddings.huggingface import HuggingFaceEmbeddings

from llama_cpp import Llama

from llama_index import (
    SimpleDirectoryReader,
    LangchainEmbedding,
    VectorStoreIndex,
    PromptHelper,
    LLMPredictor,
    ServiceContext,
    Document
)
from llama_index.llms import LlamaCPP
from llama_index.llms.llama_utils import messages_to_prompt, completion_to_prompt

In [2]:
import llama_index
llama_index.__version__

'0.8.22'

Note: notebook assumes that in the reginald directory, there is a `gguf_models/` folder. Here we've downloaded the quantized 6-bit version of Llama-2-13b-Chat from [`TheBloke/Llama-2-13b-Chat-GGUF`](https://huggingface.co/TheBloke/Llama-2-13b-Chat-GGUF). 

## Quick example with llama-cpp-python

In [3]:
llama_2_path = "../../gguf_models/llama-2-13b-chat.Q6_K.gguf"

## Using metal acceleration

In [4]:
llm = Llama(model_path=llama_2_path, n_gpu_layers=1)

llama_model_loader: loaded meta data with 19 key-value pairs and 363 tensors from ../../gguf_models/llama-2-13b-chat.Q6_K.gguf (version GGUF V2 (latest))
llama_model_loader: - tensor    0:                token_embd.weight q6_K     [  5120, 32000,     1,     1 ]
llama_model_loader: - tensor    1:           blk.0.attn_norm.weight f32      [  5120,     1,     1,     1 ]
llama_model_loader: - tensor    2:            blk.0.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]
llama_model_loader: - tensor    3:            blk.0.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]
llama_model_loader: - tensor    4:              blk.0.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]
llama_model_loader: - tensor    5:            blk.0.ffn_norm.weight f32      [  5120,     1,     1,     1 ]
llama_model_loader: - tensor    6:              blk.0.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]
llama_model_loader: - tensor    7:         blk.0.attn_output.weight q6_K     [  5120,  512

In [5]:
prompt_example = "Name all the planets in the solar system and state their distances to the sun"

In [6]:
output = llm(prompt_example, max_tokens=512, echo=True)


llama_print_timings:        load time =   567.44 ms
llama_print_timings:      sample time =   229.39 ms /   327 runs   (    0.70 ms per token,  1425.54 tokens per second)
llama_print_timings: prompt eval time =   567.41 ms /    17 tokens (   33.38 ms per token,    29.96 tokens per second)
llama_print_timings:        eval time = 30608.18 ms /   326 runs   (   93.89 ms per token,    10.65 tokens per second)
llama_print_timings:       total time = 31823.15 ms


In [7]:
print(output)

{'id': 'cmpl-35d1cb16-69fa-4ff5-9bf3-aaf53d3e866d', 'object': 'text_completion', 'created': 1694191828, 'model': '../../gguf_models/llama-2-13b-chat.Q6_K.gguf', 'choices': [{'text': 'Name all the planets in the solar system and state their distances to the sun.\n\nThere are eight planets in the solar system: Mercury, Venus, Earth, Mars, Jupiter, Saturn, Uranus, and Neptune. Here is a list of the planets in order from closest to farthest from the Sun:\n\n1. Mercury - 57,909,227 km (0.387 AU)\n2. Venus - 108,208,930 km (0.723 AU)\n3. Earth - 149,597,870 km (1 AU)\n4. Mars - 225,000,000 km (1.381 AU)\n5. Jupiter - 778,299,000 km (5.203 AU)\n6. Saturn - 1,426,666,400 km (8.388 AU)\n7. Uranus - 2,870,972,200 km (19.218 AU)\n8. Neptune - 4,497,072,000 km (30.05 AU)\n\nNote: One astronomical unit (AU) is the average distance between the Earth and the Sun, which is approximately 93 million miles or 149.6 million kilometers.', 'index': 0, 'logprobs': None, 'finish_reason': 'stop'}], 'usage': {'

In [8]:
print(output["choices"][0]["text"])

Name all the planets in the solar system and state their distances to the sun.

There are eight planets in the solar system: Mercury, Venus, Earth, Mars, Jupiter, Saturn, Uranus, and Neptune. Here is a list of the planets in order from closest to farthest from the Sun:

1. Mercury - 57,909,227 km (0.387 AU)
2. Venus - 108,208,930 km (0.723 AU)
3. Earth - 149,597,870 km (1 AU)
4. Mars - 225,000,000 km (1.381 AU)
5. Jupiter - 778,299,000 km (5.203 AU)
6. Saturn - 1,426,666,400 km (8.388 AU)
7. Uranus - 2,870,972,200 km (19.218 AU)
8. Neptune - 4,497,072,000 km (30.05 AU)

Note: One astronomical unit (AU) is the average distance between the Earth and the Sun, which is approximately 93 million miles or 149.6 million kilometers.


## Using CPU

In [9]:
llm = Llama(model_path=llama_2_path)

llama_model_loader: loaded meta data with 19 key-value pairs and 363 tensors from ../../gguf_models/llama-2-13b-chat.Q6_K.gguf (version GGUF V2 (latest))
llama_model_loader: - tensor    0:                token_embd.weight q6_K     [  5120, 32000,     1,     1 ]
llama_model_loader: - tensor    1:           blk.0.attn_norm.weight f32      [  5120,     1,     1,     1 ]
llama_model_loader: - tensor    2:            blk.0.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]
llama_model_loader: - tensor    3:            blk.0.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]
llama_model_loader: - tensor    4:              blk.0.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]
llama_model_loader: - tensor    5:            blk.0.ffn_norm.weight f32      [  5120,     1,     1,     1 ]
llama_model_loader: - tensor    6:              blk.0.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]
llama_model_loader: - tensor    7:         blk.0.attn_output.weight q6_K     [  5120,  512

In [10]:
output = llm(prompt_example, max_tokens=512, echo=True)


llama_print_timings:        load time =  2349.61 ms
llama_print_timings:      sample time =   258.02 ms /   329 runs   (    0.78 ms per token,  1275.08 tokens per second)
llama_print_timings: prompt eval time =  2349.57 ms /    17 tokens (  138.21 ms per token,     7.24 tokens per second)
llama_print_timings:        eval time = 44262.75 ms /   328 runs   (  134.95 ms per token,     7.41 tokens per second)
llama_print_timings:       total time = 47359.38 ms


By inspection, we can see that the metal acceleration is faster as expected.

In [11]:
print(output["choices"][0]["text"])

Name all the planets in the solar system and state their distances to the sun.
The eight planets of our solar system are: Mercury, Venus, Earth, Mars, Jupiter, Saturn, Uranus, and Neptune. The average distance from each planet to the sun is as follows:

1. Mercury - 57,909,227 km (0.387 AU)
2. Venus - 108,208,930 km (0.723 AU)
3. Earth - 149,597,890 km (1 AU)
4. Mars - 225,000,000 km (1.381 AU)
5. Jupiter - 778,299,000 km (5.203 AU)
6. Saturn - 1,426,666,400 km (8.388 AU)
7. Uranus - 2,870,972,200 km (19.18 AU)
8. Neptune - 4,497,072,000 km (30.05 AU)

Note: One astronomical unit (AU) is the average distance between the Earth and the sun, which is about 149,600,000 km or 92,955,800 miles.


## Using Llama2 in `llama-index`

In [12]:
llm = LlamaCPP(
    model_path=llama_2_path,
    temperature=0.1,
    max_new_tokens=1024,
    context_window=3900,
    # kwargs to pass to __call__()
    generate_kwargs={},
    # kwargs to pass to __init__()
    # set to at least 1 to use GPU
    model_kwargs={"n_gpu_layers": 1},
    # transform inputs into Llama2 format
    messages_to_prompt=messages_to_prompt,
    completion_to_prompt=completion_to_prompt,
    verbose=True,
)

llama_model_loader: loaded meta data with 19 key-value pairs and 363 tensors from ../../gguf_models/llama-2-13b-chat.Q6_K.gguf (version GGUF V2 (latest))
llama_model_loader: - tensor    0:                token_embd.weight q6_K     [  5120, 32000,     1,     1 ]
llama_model_loader: - tensor    1:           blk.0.attn_norm.weight f32      [  5120,     1,     1,     1 ]
llama_model_loader: - tensor    2:            blk.0.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]
llama_model_loader: - tensor    3:            blk.0.ffn_gate.weight q6_K     [  5120, 13824,     1,     1 ]
llama_model_loader: - tensor    4:              blk.0.ffn_up.weight q6_K     [  5120, 13824,     1,     1 ]
llama_model_loader: - tensor    5:            blk.0.ffn_norm.weight f32      [  5120,     1,     1,     1 ]
llama_model_loader: - tensor    6:              blk.0.attn_k.weight q6_K     [  5120,  5120,     1,     1 ]
llama_model_loader: - tensor    7:         blk.0.attn_output.weight q6_K     [  5120,  512

In [13]:
llm.metadata

LLMMetadata(context_window=3900, num_output=1024, is_chat_model=False, is_function_calling_model=False, model_name='../../gguf_models/llama-2-13b-chat.Q6_K.gguf')

In [14]:
handbook = pd.read_csv("../../data/public/handbook-scraped.csv")
wiki = pd.read_csv("../../data/turing_internal/wiki-scraped.csv")
# turing = pd.read_csv("../../data/public/turingacuk-no-boilerplate.csv")

text_list = list(handbook["body"].astype("str")) + list(wiki["body"].astype("str"))
documents = [Document(text=t) for t in text_list]

In [15]:
hfemb = HuggingFaceEmbeddings()
embed_model = LangchainEmbedding(hfemb)

In [16]:
hfemb

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 384, 'do_lower_case': False}) with Transformer model: MPNetModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
), model_name='sentence-transformers/all-mpnet-base-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False)

In [17]:
# set number of output tokens
num_output = 1024
# set maximum input size
context_window = 4096
# set maximum chunk overlap
chunk_size_limit = 512
chunk_overlap_ratio = 0.1

prompt_helper = PromptHelper(
    context_window=context_window,
    num_output=num_output,
    chunk_size_limit=chunk_size_limit,
    chunk_overlap_ratio=chunk_overlap_ratio,
)

In [18]:
 service_context = ServiceContext.from_defaults(
    llm_predictor=LLMPredictor(llm=llm),
    embed_model=embed_model,
    prompt_helper=prompt_helper,
    chunk_size=chunk_size_limit,
)

index = VectorStoreIndex.from_documents(
    documents, service_context=service_context
)

In [19]:
response_mode = "simple_summarize"
similarity_top_k = 3

In [20]:
query_engine = index.as_query_engine(response_mode=response_mode,
                                     similarity_top_k=similarity_top_k)

In [21]:
response = query_engine.query("what should a new starter in REG do?")
print(response.response)

  As a new starter in REG, you should:

1. Attend buddy meetings with your assigned buddies to get familiarized with the team and ask any questions you may have.
2. Attend HR induction and IT induction meetings to discuss general information such as pay, health, leaves, benefits, and accounts.
3. Meet with your line manager to discuss your role and responsibilities.
4. Shadow meetings across the group to get a feel for how REG works and meet people.
5. Complete all necessary forms on Cezanne, including personal details, bank details, NI Health form, Additional characteristics form, and signed and dated scanned contract.
6. Upload your photo to the Documents section on Cezanne.
7. Request a British Library pass to access the office.
8. Complete Agenda screening (if you haven't already done so).
9. Read about health and dental insurance and decide whether to sign up.
10. Check the Turing Benefits site for useful discounts.
11. Provide a description for your public profile on the Turing w


llama_print_timings:        load time =  9387.84 ms
llama_print_timings:      sample time =   254.43 ms /   363 runs   (    0.70 ms per token,  1426.70 tokens per second)
llama_print_timings: prompt eval time = 29778.92 ms /  1296 tokens (   22.98 ms per token,    43.52 tokens per second)
llama_print_timings:        eval time = 41385.82 ms /   362 runs   (  114.33 ms per token,     8.75 tokens per second)
llama_print_timings:       total time = 71899.77 ms


In [22]:
len(response.source_nodes)

3

In [23]:
response = query_engine.query("What is the starting salary for a standard RSE in REG?")
print(response.response)

Llama.generate: prefix-match hit


  Based on the provided context information, the starting salary for a standard RSE in REG would be £40,000. This is the bottom third baseline for the Standard role in the 2023-24 Institute-wide HERA Bands.



llama_print_timings:        load time =  9387.84 ms
llama_print_timings:      sample time =    42.72 ms /    61 runs   (    0.70 ms per token,  1427.84 tokens per second)
llama_print_timings: prompt eval time = 49457.50 ms /  1809 tokens (   27.34 ms per token,    36.58 tokens per second)
llama_print_timings:        eval time =  7267.23 ms /    60 runs   (  121.12 ms per token,     8.26 tokens per second)
llama_print_timings:       total time = 56845.16 ms


## Context chat engine

In [24]:
system_prompt=(
    "You are a helpful assistant, able to have normal interactions, "
    # "as well as talk about the Research Engineering Group (REG) "
    # "and The Alan Turing Institute based on the context provided. "
    "Please answer questions with the context provided if it is relevant. "
    "If the context is not related to the question or message, answer normally. "
    "Do not speculate or make up information. "
    "Do not reference any given instructions or context. "
    "Do not thank me for any additional context. "
)

In [25]:
system_prompt

'You are a helpful assistant, able to have normal interactions, Please answer questions with the context provided if it is relevant. If the context is not related to the question or message, answer normally. Do not speculate or make up information. Do not reference any given instructions or context. Do not thank me for any additional context. '

In [26]:
chat_engine = index.as_chat_engine(
    chat_mode="context",
    response_mode=response_mode,
    similarity_top_k=similarity_top_k,
    system_prompt=system_prompt,
)

In [27]:
chat_engine._prefix_messages

[ChatMessage(role=<MessageRole.SYSTEM: 'system'>, content='You are a helpful assistant, able to have normal interactions, Please answer questions with the context provided if it is relevant. If the context is not related to the question or message, answer normally. Do not speculate or make up information. Do not reference any given instructions or context. Do not thank me for any additional context. ', additional_kwargs={})]

In [28]:
chat_engine._llm.chat

<bound method llm_chat_callback.<locals>.wrap.<locals>.wrapped_llm_chat of LlamaCPP(callback_manager=<llama_index.callbacks.base.CallbackManager object at 0x5391e0750>, model_url='https://huggingface.co/TheBloke/Llama-2-13B-chat-GGML/resolve/main/llama-2-13b-chat.ggmlv3.q4_0.bin', model_path='../../gguf_models/llama-2-13b-chat.Q6_K.gguf', temperature=0.1, max_new_tokens=1024, context_window=3900, messages_to_prompt=<function messages_to_prompt at 0x16346e0c0>, completion_to_prompt=<function completion_to_prompt at 0x16346e160>, generate_kwargs={'temperature': 0.1, 'max_tokens': 1024, 'stream': False}, model_kwargs={'n_gpu_layers': 1, 'n_ctx': 3900, 'verbose': True}, verbose=True)>

In [29]:
response = chat_engine.chat(
    "hello"
)
print(response)

Llama.generate: prefix-match hit


  Hello! I'm here to help with any questions you have. What would you like to know or discuss? Please keep in mind that I can only provide information based on the context provided, so if your question is not related to the context, I may not be able to assist. Go ahead and ask away!



llama_print_timings:        load time =  9387.84 ms
llama_print_timings:      sample time =    46.23 ms /    66 runs   (    0.70 ms per token,  1427.61 tokens per second)
llama_print_timings: prompt eval time =  9458.20 ms /   536 tokens (   17.65 ms per token,    56.67 tokens per second)
llama_print_timings:        eval time =  6521.28 ms /    65 runs   (  100.33 ms per token,     9.97 tokens per second)
llama_print_timings:       total time = 16106.92 ms


In [30]:
response = chat_engine.chat(
    "what should a new starter in REG do?"
)
print(response)

Llama.generate: prefix-match hit


  Based on the provided context, here are some tasks that a new starter in REG might consider doing:

1. Meet with their buddies: REG provides two buddies for each new starter to be friendly points of contact and help with any questions or issues.
2. Attend HR induction: HR will set up a meeting with the new starter to discuss general information such as pay, health, leaves, and benefits.
3. Attend IT induction: IT will meet the new starter to discuss accounts and Turing wide systems.
4. Shadow meetings: REG offers new starters the opportunity to shadow meetings across the group to meet people and get a feel for how they work.
5. Complete Agenda screening: HR requires all new starters to complete an Agenda screening report before starting.
6. Upload personal details and documents to Cezanne: New starters should enter specific personal details, such as DOB, home address, emergency contact details, and bank details, and upload a photo to the Documents section on Cezanne.
7. Complete and 


llama_print_timings:        load time =  9387.84 ms
llama_print_timings:      sample time =   423.75 ms /   603 runs   (    0.70 ms per token,  1423.00 tokens per second)
llama_print_timings: prompt eval time = 31744.09 ms /  1359 tokens (   23.36 ms per token,    42.81 tokens per second)
llama_print_timings:        eval time = 70752.81 ms /   602 runs   (  117.53 ms per token,     8.51 tokens per second)
llama_print_timings:       total time = 103788.19 ms


In [31]:
response = chat_engine.chat(
    "I've already completed my inductions and uploaded my "
    "documents for Cezanne and the ATI website, what else is there to do?"
)
print(response)

Llama.generate: prefix-match hit


  Great, it sounds like you have already taken care of some of the essential tasks for new starters in REG! Here are a few more things you might consider doing:

1. Familiarize yourself with the REG handbook: The REG handbook provides an overview of how REG operates, including information on projects, service areas, and 22 days time.
2. Explore the project tracker: The project tracker is a tool used to express preferences on upcoming projects and track progress. You can browse the tracker to get an idea of what REG is working on and express your interests in specific projects.
3. Join the #new-starters Slack channel: This channel is a great place to connect with other new starters and ask questions or share information.
4. Attend tech talks: REG runs tech talks every Tuesday lunchtime, which cover a range of topics related to research and technology. You can find the upcoming topics on the REG Tech Talks Slack channel.
5. Check out the Turing Data Stories (TDS) repository: TDS is a sep


llama_print_timings:        load time =  9387.84 ms
llama_print_timings:      sample time =   353.78 ms /   505 runs   (    0.70 ms per token,  1427.43 tokens per second)
llama_print_timings: prompt eval time = 71155.50 ms /  2339 tokens (   30.42 ms per token,    32.87 tokens per second)
llama_print_timings:        eval time = 66587.39 ms /   504 runs   (  132.12 ms per token,     7.57 tokens per second)
llama_print_timings:       total time = 138794.64 ms


In [32]:
response = chat_engine.chat(
    "thanks! what is the link to the project tracker?"
)
print(response)

Llama.generate: prefix-match hit


  You're welcome! The link to the REG project tracker is:

<https://alan-turing-institute.github.io/REG-handbook/docs/our_projects/project_tracker/>

This page displays all upcoming, current, and completed projects in REG, along with information about each project's status and the issue number in the Hut23 repository. You can use this tracker to express preferences on upcoming projects and track progress.

Please note that you may need to log in to access some of the links or features on the project tracker page. If you have any questions or need help finding something, feel free to reach out to your line manager or buddies for assistance.



llama_print_timings:        load time =  9387.84 ms
llama_print_timings:      sample time =   113.80 ms /   162 runs   (    0.70 ms per token,  1423.60 tokens per second)
llama_print_timings: prompt eval time = 79735.58 ms /  2512 tokens (   31.74 ms per token,    31.50 tokens per second)
llama_print_timings:        eval time = 21263.09 ms /   161 runs   (  132.07 ms per token,     7.57 tokens per second)
llama_print_timings:       total time = 101326.02 ms


In [33]:
response = chat_engine.chat(
    "that link doesn't seem to be right. the project tracker is in the Hut23 GitHub repo"
)
print(response)

Llama.generate: prefix-match hit


  Oh dear! It looks like I made a mistake. Thank you for letting me know.

The REG project tracker is indeed located in the Hut23 GitHub repository, and you can access it by following these steps:

1. Go to the Hut23 GitHub repository: <https://alan-turing-institute.github.io/Hut23/>
2. Click on the "Projects" tab in the top navigation menu.
3. You will see a list of all upcoming, current, and completed projects in REG, along with information about each project's status and the issue number in the Hut23 repository.

You can use this tracker to express preferences on upcoming projects and track progress. If you have any questions or need help finding something, feel free to reach out to your line manager or buddies for assistance.



llama_print_timings:        load time =  9387.84 ms
llama_print_timings:      sample time =   133.99 ms /   191 runs   (    0.70 ms per token,  1425.46 tokens per second)
llama_print_timings: prompt eval time = 92744.62 ms /  2632 tokens (   35.24 ms per token,    28.38 tokens per second)
llama_print_timings:        eval time = 26045.85 ms /   190 runs   (  137.08 ms per token,     7.29 tokens per second)
llama_print_timings:       total time = 119174.93 ms


Ran out of context length after this.

In [34]:
chat_engine.chat_history

[ChatMessage(role=<MessageRole.USER: 'user'>, content='hello', additional_kwargs={}),
 ChatMessage(role=<MessageRole.ASSISTANT: 'assistant'>, content="  Hello! I'm here to help with any questions you have. What would you like to know or discuss? Please keep in mind that I can only provide information based on the context provided, so if your question is not related to the context, I may not be able to assist. Go ahead and ask away!", additional_kwargs={}),
 ChatMessage(role=<MessageRole.USER: 'user'>, content='what should a new starter in REG do?', additional_kwargs={}),
 ChatMessage(role=<MessageRole.ASSISTANT: 'assistant'>, content='  Based on the provided context, here are some tasks that a new starter in REG might consider doing:\n\n1. Meet with their buddies: REG provides two buddies for each new starter to be friendly points of contact and help with any questions or issues.\n2. Attend HR induction: HR will set up a meeting with the new starter to discuss general information such 

## Context chat example for obtaining starting salary

In [35]:
chat_engine = index.as_chat_engine(
    chat_mode="context",
    response_mode=response_mode,
    similarity_top_k=similarity_top_k,
    system_prompt=system_prompt,
)

In [36]:
response = chat_engine.chat(
    "what is the starting salary for a standard RSE in REG?"
)
print(response)

Llama.generate: prefix-match hit


  Based on the provided context, the starting salary for a Standard RSE in REG is £40,000. This is the bottom third baseline for the Standard band, which is £40,000 - £48,491.



llama_print_timings:        load time =  9387.84 ms
llama_print_timings:      sample time =    41.39 ms /    59 runs   (    0.70 ms per token,  1425.47 tokens per second)
llama_print_timings: prompt eval time = 50427.23 ms /  1866 tokens (   27.02 ms per token,    37.00 tokens per second)
llama_print_timings:        eval time =  7028.91 ms /    58 runs   (  121.19 ms per token,     8.25 tokens per second)
llama_print_timings:       total time = 57572.15 ms


In [37]:
response = chat_engine.chat(
    "is that for 2023/24?"
)
print(response)

Llama.generate: prefix-match hit


  Yes, the starting salary of £40,000 is for the 2023/24 financial year, as mentioned in the context. The cost of living increase for 2023 is 5%, and the cumulative cost of living increase from 2020 to 2023 is 20.74%.



llama_print_timings:        load time =  9387.84 ms
llama_print_timings:      sample time =    55.32 ms /    79 runs   (    0.70 ms per token,  1427.95 tokens per second)
llama_print_timings: prompt eval time = 34095.64 ms /  1427 tokens (   23.89 ms per token,    41.85 tokens per second)
llama_print_timings:        eval time =  8928.90 ms /    78 runs   (  114.47 ms per token,     8.74 tokens per second)
llama_print_timings:       total time = 43178.32 ms


In [38]:
response = chat_engine.chat(
    "can you show me the table for the 2023/24 salaries?"
)
print(response)

Llama.generate: prefix-match hit


  Certainly! Here's the table for the 2023/24 salaries based on the provided context:

| Role | Band | Role Salary Min | Role Salary Max | Bottom Third Baseline | Middle Third Baseline | Top Third Baseline |
| --- | --- | --- | --- | --- | --- | --- |
| Principal | 6 | £73,526 | £84,488 | £73,526 | £77,180 | £80,834 |
| Lead | 5 | £62,666 | £73,297 | £62,666 | £66,210 | £69,754 |
| Senior | 4 | £51,476 | £62,108 | £51,476 | £55,020 | £58,564 |
| Standard | 3b* | £42,000 | £50,916 | £42,000 | £44,972 | £47,944 |
| Junior | 3a* | £38,048 | £39,900 | £38,048 | £38,665 | £39,283 |

Note that the table only shows the salary ranges for the REG roles, as the other roles are not relevant to the context. Also, the bottom third baseline is the starting salary for a new hire in the role, while the middle and top third baselines represent the salary progression for existing employees based on their performance and experience.



llama_print_timings:        load time =  9387.84 ms
llama_print_timings:      sample time =   282.04 ms /   396 runs   (    0.71 ms per token,  1404.07 tokens per second)
llama_print_timings: prompt eval time = 59088.33 ms /  2057 tokens (   28.73 ms per token,    34.81 tokens per second)
llama_print_timings:        eval time = 50284.80 ms /   395 runs   (  127.30 ms per token,     7.86 tokens per second)
llama_print_timings:       total time = 110246.08 ms


## "React" chat engine

In [39]:
chat_engine = index.as_chat_engine(chat_mode="react",
                                   response_mode=response_mode,
                                   similarity_top_k=similarity_top_k,
                                   verbose=True)

In [40]:
response = chat_engine.chat(
    "what should a new starter in REG do?"
)
print(response)

Llama.generate: prefix-match hit


[38;5;200m[1;3mThought: I need to use a tool to help me answer the question.
Action: query_engine_tool
Action Input: {'input': 'What should a new starter in REG do?'}
[0m


llama_print_timings:        load time =  9387.84 ms
llama_print_timings:      sample time =    39.60 ms /    56 runs   (    0.71 ms per token,  1414.21 tokens per second)
llama_print_timings: prompt eval time =  7363.70 ms /   441 tokens (   16.70 ms per token,    59.89 tokens per second)
llama_print_timings:        eval time =  5465.05 ms /    55 runs   (   99.36 ms per token,    10.06 tokens per second)
llama_print_timings:       total time = 12942.65 ms
Llama.generate: prefix-match hit


[36;1m[1;3mObservation:   As a new starter in REG, you should:

1. Attend buddy meetings with your assigned buddies to get familiarized with the team and ask any questions you may have.
2. Attend HR induction and IT induction meetings to complete necessary paperwork and set up accounts.
3. Meet with your line manager to discuss your role, responsibilities, and project assignments.
4. Shadow meetings across the group to get a feel for how REG works and meet people.
5. Complete all necessary forms and tasks on Cezanne, including updating personal details, completing health and safety forms, and signing the "Right to Work" document.
6. Request a British Library pass to access the office.
7. Read about health and dental insurance options and decide whether to sign up.
8. Check the Turing Benefits site for discounts and benefits.
9. Provide a description for your profile on the ATI website.
10. Verify your MoorePay account for payslips.

It is also recommended that you:

* Join in for wel


llama_print_timings:        load time =  9387.84 ms
llama_print_timings:      sample time =   228.08 ms /   320 runs   (    0.71 ms per token,  1403.03 tokens per second)
llama_print_timings: prompt eval time = 29339.14 ms /  1282 tokens (   22.89 ms per token,    43.70 tokens per second)
llama_print_timings:        eval time = 36476.99 ms /   319 runs   (  114.35 ms per token,     8.75 tokens per second)
llama_print_timings:       total time = 66512.02 ms
Llama.generate: prefix-match hit


[38;5;200m[1;3mResponse: As a new starter in REG, you should attend buddy meetings with your assigned buddies to get familiarized with the team and ask any questions you may have. You should also attend HR induction and IT induction meetings, meet with your line manager to discuss your role and project assignments, shadow meetings across the group, complete all necessary forms and tasks on Cezanne, request a British Library pass, read about health and dental insurance options, check the Turing Benefits site for discounts and benefits, provide a description for your profile on the ATI website, and verify your MoorePay account for payslips. Additionally, you should join in for welcome coffee(s) to introduce yourself to the whole REG team, attend 1-on-1 meetings with REG's Director within the first few weeks of starting, and use the first few days to set up your laptop and tools, get familiarized with the internal wiki and handbook, and shadow meetings.
[0mAs a new starter in REG, you 


llama_print_timings:        load time =  9387.84 ms
llama_print_timings:      sample time =   161.03 ms /   230 runs   (    0.70 ms per token,  1428.29 tokens per second)
llama_print_timings: prompt eval time = 15762.62 ms /   817 tokens (   19.29 ms per token,    51.83 tokens per second)
llama_print_timings:        eval time = 24342.82 ms /   229 runs   (  106.30 ms per token,     9.41 tokens per second)
llama_print_timings:       total time = 40567.23 ms


In [41]:
response = chat_engine.chat("What did I ask you before?")
print(response)

Llama.generate: prefix-match hit


[38;5;200m[1;3mResponse:   You asked me: what should a new starter in REG do?
[0m  You asked me: what should a new starter in REG do?



llama_print_timings:        load time =  9387.84 ms
llama_print_timings:      sample time =    11.91 ms /    17 runs   (    0.70 ms per token,  1426.89 tokens per second)
llama_print_timings: prompt eval time =  4737.11 ms /   231 tokens (   20.51 ms per token,    48.76 tokens per second)
llama_print_timings:        eval time =  1635.97 ms /    16 runs   (  102.25 ms per token,     9.78 tokens per second)
llama_print_timings:       total time =  6406.09 ms


In [42]:
response = chat_engine.chat("Have you used the query engine yet?")
print(response)

Llama.generate: prefix-match hit


[38;5;200m[1;3mResponse:   No, I have not used the query engine yet.
[0m  No, I have not used the query engine yet.



llama_print_timings:        load time =  9387.84 ms
llama_print_timings:      sample time =     9.11 ms /    13 runs   (    0.70 ms per token,  1427.00 tokens per second)
llama_print_timings: prompt eval time =   980.09 ms /    36 tokens (   27.22 ms per token,    36.73 tokens per second)
llama_print_timings:        eval time =  1232.60 ms /    12 runs   (  102.72 ms per token,     9.74 tokens per second)
llama_print_timings:       total time =  2237.77 ms


In [43]:
response = chat_engine.chat("What have I asked you so far?")
print(response)

Llama.generate: prefix-match hit


[38;5;200m[1;3mResponse:   You have asked me:

1. What should a new starter in REG do?
2. Have you used the query engine yet?
[0m  You have asked me:

1. What should a new starter in REG do?
2. Have you used the query engine yet?



llama_print_timings:        load time =  9387.84 ms
llama_print_timings:      sample time =    23.11 ms /    33 runs   (    0.70 ms per token,  1427.83 tokens per second)
llama_print_timings: prompt eval time =   694.77 ms /    32 tokens (   21.71 ms per token,    46.06 tokens per second)
llama_print_timings:        eval time =  3312.30 ms /    32 runs   (  103.51 ms per token,     9.66 tokens per second)
llama_print_timings:       total time =  4071.13 ms


Reset chat engine...

In [44]:
chat_engine.reset()

In [45]:
response = chat_engine.chat("What did I ask you before?")
print(response)

Llama.generate: prefix-match hit


[38;5;200m[1;3mThought: I need to use a tool to help me answer the question.
Action: query_engine_tool
Action Input: {'input': 'What did I ask you before?'}
[0m


llama_print_timings:        load time =  9387.84 ms
llama_print_timings:      sample time =    42.02 ms /    60 runs   (    0.70 ms per token,  1427.76 tokens per second)
llama_print_timings: prompt eval time =   382.90 ms /    11 tokens (   34.81 ms per token,    28.73 tokens per second)
llama_print_timings:        eval time =  5846.63 ms /    59 runs   (   99.10 ms per token,    10.09 tokens per second)
llama_print_timings:       total time =  6345.72 ms
Llama.generate: prefix-match hit


[36;1m[1;3mObservation:   Based on the current context information provided, you have not asked me any questions before.
[0m


llama_print_timings:        load time =  9387.84 ms
llama_print_timings:      sample time =    13.35 ms /    19 runs   (    0.70 ms per token,  1423.54 tokens per second)
llama_print_timings: prompt eval time =  1406.31 ms /   102 tokens (   13.79 ms per token,    72.53 tokens per second)
llama_print_timings:        eval time =  1687.97 ms /    18 runs   (   93.78 ms per token,    10.66 tokens per second)
llama_print_timings:       total time =  3130.96 ms
Llama.generate: prefix-match hit


[38;5;200m[1;3mThought: Hmm, that's correct. Let me try again.
Action: query_engine_tool
Action Input: {'input': 'What is the purpose of this conversation?'}
[0m


llama_print_timings:        load time =  9387.84 ms
llama_print_timings:      sample time =    30.17 ms /    43 runs   (    0.70 ms per token,  1425.35 tokens per second)
llama_print_timings: prompt eval time =  8934.27 ms /   508 tokens (   17.59 ms per token,    56.86 tokens per second)
llama_print_timings:        eval time =  4208.91 ms /    42 runs   (  100.21 ms per token,     9.98 tokens per second)
llama_print_timings:       total time = 13226.36 ms
Llama.generate: prefix-match hit


[36;1m[1;3mObservation:   Based on the context information provided, the purpose of this conversation is to discuss and share information related to technology, specifically about projects, data science, computer science, and software engineering. The conversation may also be used as an opportunity to seek help and input from others.
[0m


llama_print_timings:        load time =  9387.84 ms
llama_print_timings:      sample time =    38.51 ms /    55 runs   (    0.70 ms per token,  1428.16 tokens per second)
llama_print_timings: prompt eval time =  3965.45 ms /   274 tokens (   14.47 ms per token,    69.10 tokens per second)
llama_print_timings:        eval time =  5213.82 ms /    54 runs   (   96.55 ms per token,    10.36 tokens per second)
llama_print_timings:       total time =  9286.15 ms
Llama.generate: prefix-match hit

llama_print_timings:        load time =  9387.84 ms
llama_print_timings:      sample time =    16.80 ms /    24 runs   (    0.70 ms per token,  1428.74 tokens per second)
llama_print_timings: prompt eval time = 11186.44 ms /   617 tokens (   18.13 ms per token,    55.16 tokens per second)
llama_print_timings:        eval time =  2336.65 ms /    23 runs   (  101.59 ms per token,     9.84 tokens per second)
llama_print_timings:       total time = 13570.41 ms


ValueError: Could not parse output:  Thought: Ah, I see. That's helpful to know.
Action: None (for now)

## React engine and asking it to use query

We saw that it didn't use the query engine in the above, but maybe we could force it to use it...

In [46]:
chat_engine = index.as_chat_engine(chat_mode="react",
                                   response_mode=response_mode,
                                   similarity_top_k=similarity_top_k,
                                   verbose=True)

In [47]:
response = chat_engine.chat(
    "Please use the query engine. What should a new starter in the research engineering group do?"
)
print(response)

Llama.generate: prefix-match hit


[38;5;200m[1;3mThought: I need to use a tool to help me answer the question.
Action: query_engine_tool
Action Input: {'input': 'What should a new starter in the research engineering group do?'}
[0m


llama_print_timings:        load time =  9387.84 ms
llama_print_timings:      sample time =    43.88 ms /    60 runs   (    0.73 ms per token,  1367.43 tokens per second)
llama_print_timings: prompt eval time =   866.61 ms /    23 tokens (   37.68 ms per token,    26.54 tokens per second)
llama_print_timings:        eval time =  5842.61 ms /    59 runs   (   99.03 ms per token,    10.10 tokens per second)
llama_print_timings:       total time =  6827.89 ms
Llama.generate: prefix-match hit


[36;1m[1;3mObservation:   Based on the provided context information, here are some suggestions for what a new starter in the Research Engineering Group (REG) should do:

1. Familiarize yourself with the New Starter page to get an overview of the team's structure, roles, and key contacts.
2. Meet your buddies, who will provide informal friendly faces for advice, guidance, and encouragement on any aspect of working within REG and ARC. Your buddies should not be assigned to the projects you will be working on, and ideally, they should be at a similarly senior level to you.
3. Shadow projects for a short while to get an idea of how the team works.
4. Participate in "Hacktoberfest"-style issues to quickly get up to speed with the team's projects and get involved if there are any gaps in allocations.
5. Attend welcome coffee sessions to meet the team and get familiar with the group's culture and processes.
6. Check in with your buddies at least once in the first couple of weeks, and again 


llama_print_timings:        load time =  9387.84 ms
llama_print_timings:      sample time =   239.88 ms /   342 runs   (    0.70 ms per token,  1425.74 tokens per second)
llama_print_timings: prompt eval time = 31629.28 ms /  1354 tokens (   23.36 ms per token,    42.81 tokens per second)
llama_print_timings:        eval time = 39397.15 ms /   341 runs   (  115.53 ms per token,     8.66 tokens per second)
llama_print_timings:       total time = 71716.37 ms
Llama.generate: prefix-match hit


[38;5;200m[1;3mResponse: Based on the provided context information, here are some suggestions for what a new starter in the Research Engineering Group (REG) should do:
[0mBased on the provided context information, here are some suggestions for what a new starter in the Research Engineering Group (REG) should do:



llama_print_timings:        load time =  9387.84 ms
llama_print_timings:      sample time =   251.04 ms /   358 runs   (    0.70 ms per token,  1426.08 tokens per second)
llama_print_timings: prompt eval time = 16593.54 ms /   849 tokens (   19.54 ms per token,    51.16 tokens per second)
llama_print_timings:        eval time = 38505.53 ms /   357 runs   (  107.86 ms per token,     9.27 tokens per second)
llama_print_timings:       total time = 55835.46 ms


In [48]:
response = chat_engine.chat(
    "I want to specifically know about a new starter in the REG team at the Turing institute"
)
print(response)

Llama.generate: prefix-match hit


[38;5;200m[1;3mThought: I need to use a tool to help me answer the question.
Action: query_engine_tool
Action Input: {'input': 'What should a new starter in the REG team at the Turing Institute do?'}
[0m


llama_print_timings:        load time =  9387.84 ms
llama_print_timings:      sample time =    52.01 ms /    74 runs   (    0.70 ms per token,  1422.80 tokens per second)
llama_print_timings: prompt eval time =  1112.21 ms /    61 tokens (   18.23 ms per token,    54.85 tokens per second)
llama_print_timings:        eval time =  7314.27 ms /    73 runs   (  100.20 ms per token,     9.98 tokens per second)
llama_print_timings:       total time =  8572.35 ms
Llama.generate: prefix-match hit


[36;1m[1;3mObservation:   As a new starter in the REG team at the Turing Institute, you should:

1. Expect to be assigned two buddies who will be friendly points of contact for you. Your buddies will welcome you on your first day and introduce you to the rest of the team.
2. Attend a welcome coffee on your first day to meet the whole REG team.
3. Have a 1-on-1 meeting with the REG Director within the first few weeks of starting.
4. Use the time before being assigned to a project to do admin tasks, set up your laptop and tools, get to know people, read the handbook and internal wiki, and shadow meetings.
5. Sign up for the buddy system to be matched with two REG buddies who can offer informal technical help and social support.
6. Review the getting started checklist and first few days pages for more information on what to expect and how to prepare.
7. Familiarize yourself with the REG wiki, which contains a repository of knowledge helpful to the Hut 23 team, including howtos and instr


llama_print_timings:        load time =  9387.84 ms
llama_print_timings:      sample time =   260.69 ms /   372 runs   (    0.70 ms per token,  1426.95 tokens per second)
llama_print_timings: prompt eval time = 39191.95 ms /  1566 tokens (   25.03 ms per token,    39.96 tokens per second)
llama_print_timings:        eval time = 44213.05 ms /   371 runs   (  119.17 ms per token,     8.39 tokens per second)
llama_print_timings:       total time = 84170.13 ms
Llama.generate: prefix-match hit


[38;5;200m[1;3mResponse: Based on the provided information, a new starter in the REG team at the Turing Institute should take the following actions to prepare for their role and integrate into the team: attend a welcome coffee, schedule a 1-on-1 meeting with the REG Director, complete admin tasks, sign up for the buddy system, review the getting started checklist and first few days pages, familiarize themselves with the REG wiki, and take care of their wellbeing and EDI.
[0mBased on the provided information, a new starter in the REG team at the Turing Institute should take the following actions to prepare for their role and integrate into the team: attend a welcome coffee, schedule a 1-on-1 meeting with the REG Director, complete admin tasks, sign up for the buddy system, review the getting started checklist and first few days pages, familiarize themselves with the REG wiki, and take care of their wellbeing and EDI.



llama_print_timings:        load time =  9387.84 ms
llama_print_timings:      sample time =   305.88 ms /   435 runs   (    0.70 ms per token,  1422.12 tokens per second)
llama_print_timings: prompt eval time = 19322.01 ms /   945 tokens (   20.45 ms per token,    48.91 tokens per second)
llama_print_timings:        eval time = 47720.26 ms /   434 runs   (  109.95 ms per token,     9.09 tokens per second)
llama_print_timings:       total time = 67951.26 ms
