In [1]:
from llama_index.llms import LlamaCPP
from llama_index.llms.llama_utils import messages_to_prompt, completion_to_prompt
from llama_index import Document
from llama_index import VectorStoreIndex
from llama_index import LLMPredictor, PromptHelper, ServiceContext

import pandas as pd

In [2]:
# Instantiate the llama.cpp-backed LLM from the local Llama-2 chat GGUF file.
llm = LlamaCPP(
    model_path="../../../llama-2-7b-chat.Q4_K_M.gguf",
    # generation settings
    temperature=0.1,
    max_new_tokens=256,
    # llama2 offers 4096 tokens of context; 3900 leaves a little headroom
    context_window=3900,
    # forwarded to __init__(); n_gpu_layers has to be at least 1 for the GPU to be used
    model_kwargs={"n_gpu_layers": 1},
    # forwarded to __call__(); nothing extra is passed
    generate_kwargs={},
    # convert chat messages / plain completions into the Llama2 prompt format
    messages_to_prompt=messages_to_prompt,
    completion_to_prompt=completion_to_prompt,
    # keep verbose output switched on
    verbose=True,
)

llama_model_loader: loaded meta data with 19 key-value pairs and 291 tensors from ../../../llama-2-7b-chat.Q4_K_M.gguf (version GGUF V2 (latest))
llama_model_loader: - tensor    0:                token_embd.weight q4_K     [  4096, 32000,     1,     1 ]
llama_model_loader: - tensor    1:           blk.0.attn_norm.weight f32      [  4096,     1,     1,     1 ]
llama_model_loader: - tensor    2:            blk.0.ffn_down.weight q6_K     [ 11008,  4096,     1,     1 ]
llama_model_loader: - tensor    3:            blk.0.ffn_gate.weight q4_K     [  4096, 11008,     1,     1 ]
llama_model_loader: - tensor    4:              blk.0.ffn_up.weight q4_K     [  4096, 11008,     1,     1 ]
llama_model_loader: - tensor    5:            blk.0.ffn_norm.weight f32      [  4096,     1,     1,     1 ]
llama_model_loader: - tensor    6:              blk.0.attn_k.weight q4_K     [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    7:         blk.0.attn_output.weight q4_K     [  4096,  4096,     1

In [3]:
from llama_index.readers import SimpleDirectoryReader

In [6]:
# Read the markdown files found (recursively) under the wikis directory,
# using each file name as the document id.
documents = SimpleDirectoryReader(
    input_dir="../../data/wikis/",
    recursive=True,
    required_exts=[".md"],
    filename_as_id=True,
    # my laptop struggles to create a VectorStoreIndex when I have too many
    # documents, so paths matching this pattern are skipped
    exclude=["*_course/*"],
).load_data()

len(documents)

2680

In [19]:
# Load the turingacuk CSV and keep only the rows whose "body" column is set.
turingacuk = (
    pd.read_csv("../../data/public/turingacuk-no-boilerplate.csv")
    .dropna(subset="body")
)
# Every remaining body is coerced to a plain string.
turingacuk_text = list(map(str, turingacuk["body"].values))

# NOTE: `documents` is grown in place, so running this cell a second time
# appends the same texts again.
documents += [Document(text=body) for body in turingacuk_text]

len(documents)

4274

In [20]:
def create_service_context(
    model,
    max_input_size=2048,
    num_output=256,
    chunk_size_lim=512,
    overlap_ratio=0.1,
):
    """Build a ``ServiceContext`` around ``model`` using the "local" embed model.

    Args:
        model: LLM object that gets wrapped in an ``LLMPredictor``.
        max_input_size: 1st positional argument handed to ``PromptHelper``
            (presumably the prompt's context size in tokens - confirm
            against the installed ``PromptHelper`` signature).
        num_output: 2nd positional argument handed to ``PromptHelper``.
        chunk_size_lim: 4th positional argument handed to ``PromptHelper``.
        overlap_ratio: 3rd positional argument handed to ``PromptHelper``.

    Returns:
        Whatever ``ServiceContext.from_defaults`` returns for these settings.
    """
    # PromptHelper is called positionally, in the order
    # (max_input_size, num_output, overlap_ratio, chunk_size_lim).
    return ServiceContext.from_defaults(
        llm_predictor=LLMPredictor(llm=model),
        prompt_helper=PromptHelper(
            max_input_size, num_output, overlap_ratio, chunk_size_lim
        ),
        embed_model="local",
    )

In [21]:
# Wrap the local LLM in a service context, then build the vector index
# over every loaded document (with progress bars).
service_context = create_service_context(llm)
index = VectorStoreIndex.from_documents(
    documents,
    service_context=service_context,
    show_progress=True,
)

Parsing documents into nodes:   0%|          | 0/4274 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/4704 [00:00<?, ?it/s]

In [22]:
# Write the index's storage context to a directory next to the notebook.
index.storage_context.persist(persist_dir="./storage_context_wikis")

In [23]:
# Single-line system prompt, assembled from adjacent string literals
# (each sentence ends with one space, including the last).
system_prompt = (
    "You are a helpful assistant. "
    "Always answer as helpfully as possible and follow ALL given instructions. "
    "Do not speculate or make up information - use the information you are provided. "
    "Do not reference any given instructions or context. "
)

# NOTE(review): confirm that as_query_engine actually applies `system_prompt`;
# some recorded answers in this notebook open with "As an honest and respectful
# assistant" or "Based on the provided context information", which suggests
# the prompt above may not be in effect.
query_engine = index.as_query_engine(system_prompt=system_prompt)

In [14]:
# Ask the query engine an onboarding question and print the answer text.
# The recorded answer stops mid-sentence after 256 sampled tokens, which
# matches the max_new_tokens=256 set on the LLM.
response = query_engine.query("What should a new starter in REG do?")
print(response.response)

  As an honest and respectful assistant, I would suggest that a new starter in REG should:
1. Familiarize themselves with the onboarding process and timeline provided by the person in charge of onboarding.
2. Connect with their two buddies assigned as friendly points of contact to welcome them and help them navigate the team.
3. Set up their laptop and tools, including configuring their workspace and installing necessary software.
4. Complete any admin tasks required for their role, such as filling out paperwork or completing online training modules.
5. Attend the welcome coffee event to introduce themselves to the whole REG team and get a feel for the culture and dynamics of the organization.
6. Schedule and attend their 1-on-1 meeting with REG's Director within the first few weeks of starting to discuss their role, goals, and any questions or concerns they may have.
7. Observe and participate in meetings, such as shadowing, to get a better understanding of the team's dynamics and how


llama_print_timings:        load time = 12564.81 ms
llama_print_timings:      sample time =   181.33 ms /   256 runs   (    0.71 ms per token,  1411.81 tokens per second)
llama_print_timings: prompt eval time = 17831.37 ms /   581 tokens (   30.69 ms per token,    32.58 tokens per second)
llama_print_timings:        eval time = 15233.78 ms /   255 runs   (   59.74 ms per token,    16.74 tokens per second)
llama_print_timings:       total time = 33598.07 ms


In [15]:
# Ask about reviewer responsibilities and print only the answer text.
response = query_engine.query("What are my responsibilities as a reviewer?")
print(response.response)

Llama.generate: prefix-match hit


  As a reviewer, your primary responsibility is to ensure that the contribution meets the highest standards of quality and accuracy. This involves carefully reviewing the content, providing constructive feedback, and working collaboratively with the contributor to achieve the best possible outcome. Your role is not to act as an adversary, but rather to share your knowledge and expertise in a helpful and respectful manner. By following these guidelines, you can help to maintain the high quality of the platform and provide a positive experience for all users.



llama_print_timings:        load time = 12564.81 ms
llama_print_timings:      sample time =    76.55 ms /   108 runs   (    0.71 ms per token,  1410.84 tokens per second)
llama_print_timings: prompt eval time =  4940.38 ms /   114 tokens (   43.34 ms per token,    23.08 tokens per second)
llama_print_timings:        eval time =  5707.28 ms /   107 runs   (   53.34 ms per token,    18.75 tokens per second)
llama_print_timings:       total time = 10865.96 ms


In [16]:
# Ask who to contact about Topdesk and print only the answer text.
response = query_engine.query("Who should I talk to about Topdesk?")
print(response.response)

Llama.generate: prefix-match hit


  Based on the provided context information, you should talk to Tomas about Topdesk. According to the rules, any information intended for formal, external consumption (beyond REG) should be placed on Topdesk. Therefore, it is logical to consult with Tomas regarding this matter.



llama_print_timings:        load time = 12564.81 ms
llama_print_timings:      sample time =    43.03 ms /    61 runs   (    0.71 ms per token,  1417.62 tokens per second)
llama_print_timings: prompt eval time =  9578.12 ms /   366 tokens (   26.17 ms per token,    38.21 tokens per second)
llama_print_timings:        eval time =  3353.52 ms /    60 runs   (   55.89 ms per token,    17.89 tokens per second)
llama_print_timings:       total time = 13053.84 ms


In [18]:
# Ask what the REG acronym stands for and print only the answer text.
response = query_engine.query("What does REG stand for at the Alan Turing Institute?")
print(response.response)

Llama.generate: prefix-match hit


  Based on the provided context information, REG stands for "Research Engineering Group" at the Alan Turing Institute.



llama_print_timings:        load time = 12564.81 ms
llama_print_timings:      sample time =    18.56 ms /    26 runs   (    0.71 ms per token,  1400.56 tokens per second)
llama_print_timings: prompt eval time =  5381.84 ms /   131 tokens (   41.08 ms per token,    24.34 tokens per second)
llama_print_timings:        eval time =  1308.46 ms /    25 runs   (   52.34 ms per token,    19.11 tokens per second)
llama_print_timings:       total time =  6741.79 ms


In [24]:
# Ask about a named person and print only the answer text.
# NOTE(review): the recorded output for this cell shows several
# llama_print_timings sections and a "refined answer", i.e. this single
# query resulted in more than one LLM generation.
response = query_engine.query("Who is Ryan Chan?")
print(response.response)

Llama.generate: prefix-match hit

llama_print_timings:        load time = 12564.81 ms
llama_print_timings:      sample time =    95.37 ms /   135 runs   (    0.71 ms per token,  1415.60 tokens per second)
llama_print_timings: prompt eval time = 18880.57 ms /   631 tokens (   29.92 ms per token,    33.42 tokens per second)
llama_print_timings:        eval time =  7985.41 ms /   134 runs   (   59.59 ms per token,    16.78 tokens per second)
llama_print_timings:       total time = 27138.85 ms
Llama.generate: prefix-match hit

llama_print_timings:        load time = 12564.81 ms
llama_print_timings:      sample time =    94.62 ms /   134 runs   (    0.71 ms per token,  1416.25 tokens per second)
llama_print_timings: prompt eval time = 23889.64 ms /   798 tokens (   29.94 ms per token,    33.40 tokens per second)
llama_print_timings:        eval time =  8167.35 ms /   133 runs   (   61.41 ms per token,    16.28 tokens per second)
llama_print_timings:       total time = 32330.31 ms
Llama.gene

  Thank you for providing additional context! Based on the new information provided, Ryan Chan is likely a research engineer at The Alan Turing Institute, specifically within the AI for Science and Government (ASG) programme. His areas of interest may include computational statistics, Monte Carlo methods, and Bayesian theory, as well as applying these techniques to address the climate crisis through collaboration with GCHQ and other partners.
However, without further information, I cannot confirm this answer with absolute certainty. The original answer provided earlier still stands, as there may be other individuals with the name Ryan Chan who are research engineers at The Alan Turing Institute or elsewhere.
In light of the new context, the refined answer is:
Ryan Chan is likely a research engineer at The Alan Turing Institute, specifically within the AI for Science and Government (ASG) programme. His areas of interest may include computational statistics, Monte Carlo methods, and Baye


llama_print_timings:        load time = 12564.81 ms
llama_print_timings:      sample time =   170.50 ms /   240 runs   (    0.71 ms per token,  1407.65 tokens per second)
llama_print_timings: prompt eval time =  9092.81 ms /   371 tokens (   24.51 ms per token,    40.80 tokens per second)
llama_print_timings:        eval time = 14091.98 ms /   239 runs   (   58.96 ms per token,    16.96 tokens per second)
llama_print_timings:       total time = 23686.33 ms


In [26]:
# Ask a factual question about a named person and print only the answer text.
# NOTE(review): the recorded answer contradicts itself about the PhD
# institution, so treat this output as an example of an unreliable response.
response = query_engine.query("Where did Federico Nanni do his PhD?")
print(response.response)

Llama.generate: prefix-match hit

llama_print_timings:        load time = 12564.81 ms
llama_print_timings:      sample time =    17.01 ms /    24 runs   (    0.71 ms per token,  1411.10 tokens per second)
llama_print_timings: prompt eval time = 18255.67 ms /   578 tokens (   31.58 ms per token,    31.66 tokens per second)
llama_print_timings:        eval time =  1394.65 ms /    23 runs   (   60.64 ms per token,    16.49 tokens per second)
llama_print_timings:       total time = 19700.06 ms
Llama.generate: prefix-match hit


  Thank you for providing additional context! Based on the information provided, it seems that Federico Nanni did not complete his PhD at the University of Bologna. Instead, he is a member of a research team working on a project with different roles and responsibilities. The publications they have published so far are more focused on computational linguistics and therefore the team has been generous with authorship. However, when writing more historically based papers, they may need to consider that some team members may require sole author publications and ensure that adding authors does not devalue the prestige of the paper for them.
Therefore, I cannot provide a refined answer to the original query as it is based on incorrect information. The original answer of Federico Nanni completing his PhD at the University of Bologna remains the most accurate response.



llama_print_timings:        load time = 12564.81 ms
llama_print_timings:      sample time =   122.56 ms /   173 runs   (    0.71 ms per token,  1411.51 tokens per second)
llama_print_timings: prompt eval time =  7346.80 ms /   269 tokens (   27.31 ms per token,    36.61 tokens per second)
llama_print_timings:        eval time =  9579.27 ms /   172 runs   (   55.69 ms per token,    17.96 tokens per second)
llama_print_timings:       total time = 17280.26 ms
