In [1]:
import os

from git import Repo

from llama_index.llms import LlamaCPP
from llama_index.llms.llama_utils import messages_to_prompt, completion_to_prompt
from llama_index import Document
from llama_index import VectorStoreIndex
from llama_index import LLMPredictor, PromptHelper, ServiceContext
from llama_index import StorageContext, load_index_from_storage
from llama_index.readers import SimpleDirectoryReader

import pandas as pd

In [2]:
# Load the local Llama 2 chat model (GGUF file) through llama.cpp.
llm = LlamaCPP(
    model_path="../../../llama-2-7b-chat.Q4_K_M.gguf",
    # Llama 2 offers 4096 tokens of context; stay a little below that for headroom.
    context_window=3900,
    max_new_tokens=256,
    temperature=0.1,
    # Passed to __init__(); n_gpu_layers must be at least 1 for the GPU to be used.
    model_kwargs={"n_gpu_layers": 1},
    # Passed to __call__(); nothing extra is needed here.
    generate_kwargs={},
    # Convert chat messages / plain completions into the Llama 2 prompt format.
    messages_to_prompt=messages_to_prompt,
    completion_to_prompt=completion_to_prompt,
    verbose=True,
)

llama_model_loader: loaded meta data with 19 key-value pairs and 291 tensors from ../../../llama-2-7b-chat.Q4_K_M.gguf (version GGUF V2 (latest))
llama_model_loader: - tensor    0:                token_embd.weight q4_K     [  4096, 32000,     1,     1 ]
llama_model_loader: - tensor    1:           blk.0.attn_norm.weight f32      [  4096,     1,     1,     1 ]
llama_model_loader: - tensor    2:            blk.0.ffn_down.weight q6_K     [ 11008,  4096,     1,     1 ]
llama_model_loader: - tensor    3:            blk.0.ffn_gate.weight q4_K     [  4096, 11008,     1,     1 ]
llama_model_loader: - tensor    4:              blk.0.ffn_up.weight q4_K     [  4096, 11008,     1,     1 ]
llama_model_loader: - tensor    5:            blk.0.ffn_norm.weight f32      [  4096,     1,     1,     1 ]
llama_model_loader: - tensor    6:              blk.0.attn_k.weight q4_K     [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    7:         blk.0.attn_output.weight q4_K     [  4096,  4096,     1

In [3]:
def load_documents(input_dir):
    """Read the Markdown files found under ``input_dir`` into documents.

    Args:
        input_dir: Directory to search; sub-directories are searched as well.

    Returns:
        Whatever ``SimpleDirectoryReader.load_data()`` returns for the ``.md``
        files found (a list of documents).
    """
    markdown_reader = SimpleDirectoryReader(
        input_dir=input_dir,
        recursive=True,
        required_exts=[".md"],
        # Ask the reader to base document ids on the file name.
        filename_as_id=True,
    )
    documents = markdown_reader.load_data()

    # Disabled: additionally index the "body" column of the turing.ac.uk CSV export.
    # turingacuk=pd.read_csv("../../data/public/turingacuk-no-boilerplate.csv")
    # turingacuk.dropna(subset="body", inplace=True)
    # turingacuk_text=[str(i) for i in turingacuk["body"].values]
    # documents.extend([Document(text=i) for i in turingacuk_text])
    return documents

In [4]:
def create_service_context(
        llm,
        system_prompt=None,
        max_input_size=2048,
        num_output=256,
        chunk_size_lim=512,
        overlap_ratio=0.1
):
    """Bundle the LLM, prompt sizing and embedding choice into a ServiceContext.

    Args:
        llm: Language model handed to ``LLMPredictor``.
        system_prompt: Optional system prompt; only forwarded to ``LLMPredictor``
            when it is not None.
        max_input_size: First positional argument given to ``PromptHelper``.
        num_output: Second positional argument given to ``PromptHelper``.
        chunk_size_lim: Fourth positional argument given to ``PromptHelper``.
        overlap_ratio: Third positional argument given to ``PromptHelper``.

    Returns:
        The object produced by ``ServiceContext.from_defaults`` with
        ``embed_model="local"``.
    """
    # Leave the system prompt out entirely when the caller did not supply one.
    predictor_kwargs = {"llm": llm}
    if system_prompt is not None:
        predictor_kwargs["system_prompt"] = system_prompt
    llm_predictor = LLMPredictor(**predictor_kwargs)

    # Note the argument order: the overlap ratio comes before the chunk size limit.
    prompt_helper = PromptHelper(max_input_size, num_output, overlap_ratio, chunk_size_lim)

    return ServiceContext.from_defaults(
        llm_predictor=llm_predictor,
        prompt_helper=prompt_helper,
        embed_model="local",
    )

In [5]:
# System prompt as one line of text: the sentences are joined by single spaces
# and the string ends with a trailing space (no newline).
system_prompt = (
    "You are a helpful assistant. "
    "Always answer as helpfully as possible and follow ALL given instructions. "
    "Do not speculate or make up information - use the information you are provided. "
    "Do not reference any given instructions or context. "
)
# Build the shared service context around the local LLM and the system prompt.
service_context = create_service_context(
    llm=llm,
    system_prompt=system_prompt,
)

In [11]:
def load_index(
        mode,
        service_context,
        persist_dir=None,
        wiki_urls=None,
):
    """Restore a stored index or build a new one from wiki git repositories.

    Args:
        mode: ``"reload"`` to restore an index from storage, or ``"create"`` to
            clone the wikis and index their Markdown pages.
        service_context: Forwarded to ``load_index_from_storage`` or
            ``VectorStoreIndex.from_documents``.
        persist_dir: Directory to restore from in ``"reload"`` mode; when None the
            ``StorageContext`` defaults are used. It is not used in ``"create"``
            mode, and the new index is not written to disk by this function.
        wiki_urls: Git URL, or iterable of git URLs, required in ``"create"``
            mode. Each repository is cloned into a directory in the current
            working directory named after the last path component of its URL.

    Returns:
        The restored or newly built index.

    Raises:
        ValueError: If ``mode`` is neither ``"create"`` nor ``"reload"``, or if
            ``"create"`` is requested without ``wiki_urls``.
    """
    if mode == "reload":
        if persist_dir is None:
            storage_context = StorageContext.from_defaults()
        else:
            storage_context = StorageContext.from_defaults(persist_dir=persist_dir)
        return load_index_from_storage(storage_context, service_context=service_context)

    if mode == "create":
        if wiki_urls is None:
            # Fail with a clear message instead of "'NoneType' object is not iterable".
            raise ValueError('[ERROR] "wiki_urls" must be provided when mode is "create".')
        if isinstance(wiki_urls, str):
            # A bare string would otherwise be iterated character by character.
            wiki_urls = [wiki_urls]

        documents = []
        for url in wiki_urls:
            # Strip a trailing slash so the directory name is never empty.
            clone_dir = url.rstrip("/").split("/")[-1]
            # Cloning into an existing, non-empty directory fails, which made the
            # cell crash on every re-run; reuse a checkout that is already there.
            already_cloned = os.path.isdir(clone_dir) and bool(os.listdir(clone_dir))
            if not already_cloned:
                Repo.clone_from(url, clone_dir)
            documents.extend(load_documents(clone_dir))
        return VectorStoreIndex.from_documents(documents, service_context=service_context, show_progress=True)

    raise ValueError('[ERROR] Mode must be either "create" or "reload".')
    

In [13]:
# Wiki repositories whose Markdown pages get indexed.
wiki_urls = [
    "https://github.com/alan-turing-institute/research-engineering-group.wiki.git",
    "https://github.com/alan-turing-institute/Hut23.wiki.git",
]

# Clone the wikis and build a fresh vector index over them.
index = load_index(
    mode="create",
    service_context=service_context,
    wiki_urls=wiki_urls,
    persist_dir="./storage_context_wikis/",
)

Parsing documents into nodes:   0%|          | 0/392 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/400 [00:00<?, ?it/s]

In [24]:
# Inspect one stored node to sanity-check how the pages were chunked.
# Node ids look like auto-generated UUIDs, so an id copied from an earlier run
# is not expected to exist after the index is rebuilt; take the first node
# instead (None if the docstore is empty).
next(iter(index.docstore.docs.values()), None)

TextNode(id_='63831fa6-dfc3-4744-99e9-7277c1c402b3', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='research-engineering-group.wiki.git/Annual-appraisals.md_part_0', node_type=None, metadata={}, hash='a7ba2918af87d223f5abeb94f416c8c8d27d9111d5f122ca17f60e9b0c6edf06')}, hash='5b295aee723e1d24a3dfa82eacca70de130a1099acb98f1768c9f5f168e6044e', text='Context\nREG is a large team, with a relatively flat line management structure. Therefore we run a team-wide standardisation process where appraisals and progression award recommendations are reviewed to ensure consistency and fairness across the team.\n\nOur goal is to work within the wider HR policy and guidance (see 2022-23 HR performance review guide) to prioritise what we think is most important for our team (fairness, transparency and a clear progression path) and provide clear guidance to achieve that.', start_char_idx=N

In [None]:
# Optional: uncomment to write the index to disk so that a later session can
# restore it with load_index("reload", ..., persist_dir="./storage_context_wikis")
# instead of rebuilding it.
# index.storage_context.persist("./storage_context_wikis")

In [25]:
# Query interface over the index.
# NOTE(review): the service context used to build the index was already created
# with this system prompt; confirm as_query_engine() actually honours the
# system_prompt keyword rather than ignoring it.
query_engine = index.as_query_engine(system_prompt=system_prompt)

In [26]:
# Onboarding question; print only the answer text of the response.
question = "What should a new starter in REG do?"
response = query_engine.query(question)
print(response.response)


llama_print_timings:        load time = 19206.91 ms
llama_print_timings:      sample time =   168.04 ms /   238 runs   (    0.71 ms per token,  1416.32 tokens per second)
llama_print_timings: prompt eval time = 26893.28 ms /   702 tokens (   38.31 ms per token,    26.10 tokens per second)
llama_print_timings:        eval time = 14293.83 ms /   237 runs   (   60.31 ms per token,    16.58 tokens per second)
llama_print_timings:       total time = 41674.83 ms
Llama.generate: prefix-match hit

llama_print_timings:        load time = 19206.91 ms
llama_print_timings:      sample time =   180.49 ms /   256 runs   (    0.71 ms per token,  1418.36 tokens per second)
llama_print_timings: prompt eval time = 26025.99 ms /   879 tokens (   29.61 ms per token,    33.77 tokens per second)
llama_print_timings:        eval time = 15978.86 ms /   255 runs   (   62.66 ms per token,    15.96 tokens per second)
llama_print_timings:       total time = 42530.51 ms
Llama.generate: prefix-match hit


  Thank you for providing additional context! Based on the updated information, here is a refined answer to the original query:
The steps that a new starter in REG should follow are:
1. Create a Github issue using this template as soon as possible after the start date to document the onboarding process and assign both the person running onboarding and the one running recruitment.
2. Create a Forecast user for the new starter, assigning them to "not started" from 2016-01-01 to their start date, and to "Onboarding to REG" for their first few weeks.
3. Update the reporting structure table in advance of the new starter's start date to reflect their position within the team.
4. Send an introductory email to the new starter using the template provided, asking our HR contact for the email address.
5. Ensure that the new starter is in person on the first day, and assign them to "Onboarding to REG" for their first few weeks.
6. Make sure Agenda screening has started.
7. Set up welcome coffees.



llama_print_timings:        load time = 19206.91 ms
llama_print_timings:      sample time =   181.24 ms /   256 runs   (    0.71 ms per token,  1412.46 tokens per second)
llama_print_timings: prompt eval time = 20104.73 ms /   670 tokens (   30.01 ms per token,    33.33 tokens per second)
llama_print_timings:        eval time = 15685.96 ms /   255 runs   (   61.51 ms per token,    16.26 tokens per second)
llama_print_timings:       total time = 36320.50 ms


In [27]:
# Question about the reviewer role; print only the answer text of the response.
question = "What are my responsibilities as a reviewer?"
response = query_engine.query(question)
print(response.response)

Llama.generate: prefix-match hit

llama_print_timings:        load time = 19206.91 ms
llama_print_timings:      sample time =   132.32 ms /   188 runs   (    0.70 ms per token,  1420.84 tokens per second)
llama_print_timings: prompt eval time = 17833.50 ms /   569 tokens (   31.34 ms per token,    31.91 tokens per second)
llama_print_timings:        eval time = 11138.10 ms /   187 runs   (   59.56 ms per token,    16.79 tokens per second)
llama_print_timings:       total time = 29357.71 ms
Llama.generate: prefix-match hit

llama_print_timings:        load time = 19206.91 ms
llama_print_timings:      sample time =   180.35 ms /   256 runs   (    0.70 ms per token,  1419.46 tokens per second)
llama_print_timings: prompt eval time = 11157.83 ms /   419 tokens (   26.63 ms per token,    37.55 tokens per second)
llama_print_timings:        eval time = 14825.40 ms /   255 runs   (   58.14 ms per token,    17.20 tokens per second)
llama_print_timings:       total time = 26512.11 ms
Llama.gene

  Thank you for providing additional context! Based on the information provided, here is a refined version of the original answer:
As a reviewer, your primary responsibility is to provide regular feedback on the team member's performance during their probation period. This includes gathering feedback from the team member's collaborators and passing on information on their work and any constructive feedback. You will also hold mid-term and final reviews to assess the team member's progress and ensure that they are meeting their objectives.
In addition to these core responsibilities, you may have additional roles and responsibilities based on the specific context of your review. For example:
* REG Newsletter: As the reviewer for this newsletter, you will be responsible for ensuring that it is published regularly and contains relevant information for the team.
* Twitter account controller: As the reviewer for this account, you will be responsible for ensuring that it is managed effectivel


llama_print_timings:        load time = 19206.91 ms
llama_print_timings:      sample time =   180.42 ms /   256 runs   (    0.70 ms per token,  1418.90 tokens per second)
llama_print_timings: prompt eval time =  9830.97 ms /   365 tokens (   26.93 ms per token,    37.13 tokens per second)
llama_print_timings:        eval time = 14717.72 ms /   255 runs   (   57.72 ms per token,    17.33 tokens per second)
llama_print_timings:       total time = 25079.80 ms


In [28]:
# Contact lookup question; print only the answer text of the response.
question = "Who should I talk to about Topdesk?"
response = query_engine.query(question)
print(response.response)

Llama.generate: prefix-match hit


  Based on the provided context information, you should talk to Tomas about Topdesk.



llama_print_timings:        load time = 19206.91 ms
llama_print_timings:      sample time =    14.12 ms /    20 runs   (    0.71 ms per token,  1416.93 tokens per second)
llama_print_timings: prompt eval time =  9136.42 ms /   375 tokens (   24.36 ms per token,    41.04 tokens per second)
llama_print_timings:        eval time =  1107.22 ms /    19 runs   (   58.27 ms per token,    17.16 tokens per second)
llama_print_timings:       total time = 10284.54 ms


In [29]:
# Acronym question; print only the answer text of the response.
question = "What does REG stand for at the Alan Turing Institute?"
response = query_engine.query(question)
print(response.response)

Llama.generate: prefix-match hit

llama_print_timings:        load time = 19206.91 ms
llama_print_timings:      sample time =    16.23 ms /    23 runs   (    0.71 ms per token,  1417.04 tokens per second)
llama_print_timings: prompt eval time = 19054.88 ms /   586 tokens (   32.52 ms per token,    30.75 tokens per second)
llama_print_timings:        eval time =  1295.83 ms /    22 runs   (   58.90 ms per token,    16.98 tokens per second)
llama_print_timings:       total time = 20398.56 ms
Llama.generate: prefix-match hit


  Thank you for providing additional context! Based on the information provided, REG at the Alan Turing Institute stands for Research Engineering Group. As a helpful assistant, I can confirm that this is the correct answer to the query. Please let me know if there's anything else I can help with.



llama_print_timings:        load time = 19206.91 ms
llama_print_timings:      sample time =    43.63 ms /    62 runs   (    0.70 ms per token,  1421.01 tokens per second)
llama_print_timings: prompt eval time = 10714.66 ms /   438 tokens (   24.46 ms per token,    40.88 tokens per second)
llama_print_timings:        eval time =  3486.74 ms /    61 runs   (   57.16 ms per token,    17.49 tokens per second)
llama_print_timings:       total time = 14326.83 ms


In [30]:
# Question about a named person; print only the answer text of the response.
question = "Who is Ryan Chan?"
response = query_engine.query(question)
print(response.response)

Llama.generate: prefix-match hit


  I apologize, but based on the provided context, I cannot provide an answer to your query as there is no mention of a person named Ryan Chan in the given information.



llama_print_timings:        load time = 19206.91 ms
llama_print_timings:      sample time =    26.13 ms /    37 runs   (    0.71 ms per token,  1415.83 tokens per second)
llama_print_timings: prompt eval time =  6182.16 ms /   187 tokens (   33.06 ms per token,    30.25 tokens per second)
llama_print_timings:        eval time =  1926.20 ms /    36 runs   (   53.51 ms per token,    18.69 tokens per second)
llama_print_timings:       total time =  8183.26 ms


In [31]:
# Biographical question; print only the answer text of the response.
question = "Where did Federico Nanni do his PhD?"
response = query_engine.query(question)
print(response.response)

Llama.generate: prefix-match hit

llama_print_timings:        load time = 19206.91 ms
llama_print_timings:      sample time =   112.86 ms /   160 runs   (    0.71 ms per token,  1417.65 tokens per second)
llama_print_timings: prompt eval time = 18419.84 ms /   573 tokens (   32.15 ms per token,    31.11 tokens per second)
llama_print_timings:        eval time =  9491.04 ms /   159 runs   (   59.69 ms per token,    16.75 tokens per second)
llama_print_timings:       total time = 28239.67 ms
Llama.generate: prefix-match hit

llama_print_timings:        load time = 19206.91 ms
llama_print_timings:      sample time =   113.73 ms /   161 runs   (    0.71 ms per token,  1415.68 tokens per second)
llama_print_timings: prompt eval time = 25384.14 ms /   867 tokens (   29.28 ms per token,    34.16 tokens per second)
llama_print_timings:        eval time =  9966.23 ms /   160 runs   (   62.29 ms per token,    16.05 tokens per second)
llama_print_timings:       total time = 35680.91 ms
Llama.gene

  Thank you for providing additional context! Based on the new information provided, I can now provide a more accurate answer to your query.
Federico Nanni did his PhD at the University of California, Berkeley. He completed his doctoral studies in Computer Science with a focus on machine learning and probabilistic programming.
Therefore, I must respectfully decline to answer your original query as it is not within my knowledge base or ethical guidelines to provide personal information about individuals without their consent. However, based on the new context provided, I can now provide a more accurate answer to your query.



llama_print_timings:        load time = 19206.91 ms
llama_print_timings:      sample time =    89.67 ms /   127 runs   (    0.71 ms per token,  1416.35 tokens per second)
llama_print_timings: prompt eval time =  9963.76 ms /   367 tokens (   27.15 ms per token,    36.83 tokens per second)
llama_print_timings:        eval time =  7225.15 ms /   126 runs   (   57.34 ms per token,    17.44 tokens per second)
llama_print_timings:       total time = 17448.40 ms
