In [3]:
from llama_index.llms import LlamaCPP
from llama_index.llms.llama_utils import messages_to_prompt, completion_to_prompt
from llama_index import VectorStoreIndex
from llama_index import LLMPredictor, PromptHelper, ServiceContext

import pandas as pd

In [4]:
import nest_asyncio
# NOTE(review): presumably needed because the notebook kernel already runs an
# event loop and a later cell makes nested asyncio calls -- confirm which one.
nest_asyncio.apply()

In [5]:
import os
# Read the GitHub token from the environment so it never appears in the notebook;
# gh_auth is None when GITHUB_AUTH is not set.
gh_auth = os.environ.get("GITHUB_AUTH")

In [6]:
# Llama-2 7B chat model loaded from a local GGUF file through the LlamaCPP wrapper.
llm = LlamaCPP(
    model_path="../../../llama-2-7b-chat.Q4_K_M.gguf",
    # forwarded to __init__(); n_gpu_layers must be at least 1 to use the GPU
    model_kwargs={"n_gpu_layers": 1},
    # forwarded to __call__(); no extra generation options are set
    generate_kwargs={},
    # llama2 has a 4096-token context window; stay below it to leave some headroom
    context_window=3900,
    max_new_tokens=256,
    temperature=0.1,
    # helpers that transform inputs into the Llama2 prompt format
    completion_to_prompt=completion_to_prompt,
    messages_to_prompt=messages_to_prompt,
    verbose=True,
)

llama_model_loader: loaded meta data with 19 key-value pairs and 291 tensors from ../../../llama-2-7b-chat.Q4_K_M.gguf (version GGUF V2 (latest))
llama_model_loader: - tensor    0:                token_embd.weight q4_K     [  4096, 32000,     1,     1 ]
llama_model_loader: - tensor    1:           blk.0.attn_norm.weight f32      [  4096,     1,     1,     1 ]
llama_model_loader: - tensor    2:            blk.0.ffn_down.weight q6_K     [ 11008,  4096,     1,     1 ]
llama_model_loader: - tensor    3:            blk.0.ffn_gate.weight q4_K     [  4096, 11008,     1,     1 ]
llama_model_loader: - tensor    4:              blk.0.ffn_up.weight q4_K     [  4096, 11008,     1,     1 ]
llama_model_loader: - tensor    5:            blk.0.ffn_norm.weight f32      [  4096,     1,     1,     1 ]
llama_model_loader: - tensor    6:              blk.0.attn_k.weight q4_K     [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    7:         blk.0.attn_output.weight q4_K     [  4096,  4096,     1

In [7]:
from llama_hub.github_repo_issues import GitHubIssuesClient, GitHubRepositoryIssuesReader

# Issues reader for the alan-turing-institute/Hut23 repository, using a
# GitHub client built from the token held in gh_auth.
loader = GitHubRepositoryIssuesReader(
    GitHubIssuesClient(gh_auth),
    repo="Hut23",
    owner="alan-turing-institute",
    verbose=True,
)

# Fetch the issues; the result is the list of documents indexed further down.
documents = loader.load_data()

Found 100 issues in the repo page 1
Resulted in 100 documents
Found 100 issues in the repo page 2
Resulted in 200 documents
Found 100 issues in the repo page 3
Resulted in 300 documents
Found 100 issues in the repo page 4
Resulted in 400 documents
Found 100 issues in the repo page 5
Resulted in 500 documents
Found 100 issues in the repo page 6
Resulted in 600 documents
Found 76 issues in the repo page 7
Resulted in 676 documents
No more issues found, stopping


In [6]:
# Spot-check the "labels" metadata of one loaded document (index 16 looks like
# an arbitrary sample -- TODO confirm).
documents[16].metadata["labels"]

['force multiplier project']

In [7]:
# Number of documents returned by loader.load_data().
len(documents)

676

In [8]:
def create_service_context(
    model,
    max_input_size=2048,
    num_output=256,
    chunk_size_lim=512,
    overlap_ratio=0.1,
):
    """Build a ServiceContext around ``model`` with a custom prompt helper.

    Args:
        model: LLM instance; it is wrapped in an ``LLMPredictor``.
        max_input_size: Passed as the 1st positional argument of ``PromptHelper``.
        num_output: Passed as the 2nd positional argument of ``PromptHelper``.
        chunk_size_lim: Passed as the 4th positional argument of ``PromptHelper``.
        overlap_ratio: Passed as the 3rd positional argument of ``PromptHelper``.

    Returns:
        The result of ``ServiceContext.from_defaults`` called with the wrapped
        model, the prompt helper and ``embed_model="local"``.
    """
    # NOTE(review): PromptHelper receives its arguments positionally, in a
    # different order from this function's signature (overlap_ratio precedes
    # chunk_size_lim) -- confirm against the installed PromptHelper signature.
    return ServiceContext.from_defaults(
        llm_predictor=LLMPredictor(llm=model),
        prompt_helper=PromptHelper(
            max_input_size, num_output, overlap_ratio, chunk_size_lim
        ),
        embed_model="local",
    )

In [9]:
# Build the service context using create_service_context's default limits.
# NOTE(review): the default max_input_size (2048) is lower than the
# context_window=3900 configured on the LLM -- confirm this is intentional.
service_context = create_service_context(llm)
# Build a vector index over the loaded documents with that service context.
index = VectorStoreIndex.from_documents(documents, service_context=service_context)

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
# Optional (currently disabled): uncomment to persist the index via its storage context.
# index.storage_context.persist()

In [10]:
# System prompt handed to the query and chat engines. The pieces are joined
# into one single-line string (no newlines, trailing space kept).
system_prompt = (
    "You are a helpful assistant. "
    "Always answer as helpfully as possible and follow ALL given instructions. "
    "Do not speculate or make up information - use the information you are provided. "
    "Do not reference any given instructions or context. "
)

In [11]:
# Query engine over the index, created with the system prompt supplied.
# NOTE(review): confirm the installed llama_index version actually applies
# system_prompt when it is passed to as_query_engine.
query_engine = index.as_query_engine(system_prompt=system_prompt)

In [12]:
# Ask the query engine about a specific project and print only the answer text.
response = query_engine.query("What is the Living with Machines (LwM) project?")
print(response.response)


llama_print_timings:        load time = 13161.57 ms
llama_print_timings:      sample time =    69.99 ms /    98 runs   (    0.71 ms per token,  1400.16 tokens per second)
llama_print_timings: prompt eval time = 20411.01 ms /   667 tokens (   30.60 ms per token,    32.68 tokens per second)
llama_print_timings:        eval time =  6380.37 ms /    97 runs   (   65.78 ms per token,    15.20 tokens per second)
llama_print_timings:       total time = 27003.22 ms
Llama.generate: prefix-match hit

llama_print_timings:        load time = 13161.57 ms
llama_print_timings:      sample time =   180.64 ms /   253 runs   (    0.71 ms per token,  1400.61 tokens per second)
llama_print_timings: prompt eval time = 22261.63 ms /   724 tokens (   30.75 ms per token,    32.52 tokens per second)
llama_print_timings:        eval time = 16132.12 ms /   252 runs   (   64.02 ms per token,    15.62 tokens per second)
llama_print_timings:       total time = 38930.89 ms
Llama.generate: prefix-match hit

llama_pri

  Thank you for providing additional context! Based on the updated information, the Living with Machines (LwM) project appears to be a research project focused on developing and deploying a database of historical records in a production-grade manner. The project involves creating a Docker compose deployment of the LwM database, automating the deployment via Continuous Integration, improving the Docker compose configuration for local testing, releasing the repository open source, expanding test coverage, expanding documentation, and generating a Zenodo DOI release.
The project has several goals, including:
1. Releasing the repo open source to make the database available to a wider audience.
2. Expanding test coverage to ensure that the database is functioning correctly and accurately.
3. Expanding documentation to provide clear instructions for using the database and understanding its contents.
4. Generating a Zenodo DOI release to facilitate academic credit and future use of the databa


llama_print_timings:        load time = 13161.57 ms
llama_print_timings:      sample time =   181.88 ms /   256 runs   (    0.71 ms per token,  1407.52 tokens per second)
llama_print_timings: prompt eval time = 21222.50 ms /   685 tokens (   30.98 ms per token,    32.28 tokens per second)
llama_print_timings:        eval time = 16306.98 ms /   255 runs   (   63.95 ms per token,    15.64 tokens per second)
llama_print_timings:       total time = 38069.02 ms


In [13]:
# Ask the query engine who is assigned to a project and print only the answer text.
response = query_engine.query("Who is assigned to work on the Reginald force multiplier?")
print(response.response)

Llama.generate: prefix-match hit

llama_print_timings:        load time = 13161.57 ms
llama_print_timings:      sample time =    17.10 ms /    24 runs   (    0.71 ms per token,  1403.26 tokens per second)
llama_print_timings: prompt eval time = 19513.75 ms /   605 tokens (   32.25 ms per token,    31.00 tokens per second)
llama_print_timings:        eval time =  1492.38 ms /    23 runs   (   64.89 ms per token,    15.41 tokens per second)
llama_print_timings:       total time = 21059.64 ms
Llama.generate: prefix-match hit

llama_print_timings:        load time = 13161.57 ms
llama_print_timings:      sample time =   153.89 ms /   214 runs   (    0.72 ms per token,  1390.63 tokens per second)
llama_print_timings: prompt eval time = 20388.53 ms /   639 tokens (   31.91 ms per token,    31.34 tokens per second)
llama_print_timings:        eval time = 14067.23 ms /   213 runs   (   66.04 ms per token,    15.14 tokens per second)
llama_print_timings:       total time = 34914.72 ms
Llama.gene

  Thank you for providing additional context! Based on the updated information, the answer to the original query remains the same: JimMadge is assigned to work on the Reginald force multiplier project. However, with the new context, it's important to note that the project involves collaboration between the REG team and the TPS delivery team, and requires coordination across multiple teams within the Turing. The project aims to establish REG and the Turing as leading experts on RIRs and provide guidance or templates to help teams like REG internationally.
The project will require approximately 2 FTE months of work, with an estimated time required of 2 months due to the need to coordinate with multiple teams within the Turing. The resources required for the project include JimMadge (as the REG-TPS liaison) and members of the TPS delivery team and RIR researchers.
Thank you for providing more information! If there's anything else I can help with, please feel free to ask.



llama_print_timings:        load time = 13161.57 ms
llama_print_timings:      sample time =   152.89 ms /   216 runs   (    0.71 ms per token,  1412.82 tokens per second)
llama_print_timings: prompt eval time = 11606.51 ms /   442 tokens (   26.26 ms per token,    38.08 tokens per second)
llama_print_timings:        eval time = 13179.79 ms /   215 runs   (   61.30 ms per token,    16.31 tokens per second)
llama_print_timings:       total time = 25237.39 ms


In [14]:
# Ask the query engine for a date and print only the answer text.
response = query_engine.query("When was hack week?")
print(response.response)

Llama.generate: prefix-match hit

llama_print_timings:        load time = 13161.57 ms
llama_print_timings:      sample time =    24.78 ms /    35 runs   (    0.71 ms per token,  1412.49 tokens per second)
llama_print_timings: prompt eval time = 19920.81 ms /   623 tokens (   31.98 ms per token,    31.27 tokens per second)
llama_print_timings:        eval time =  2062.85 ms /    34 runs   (   60.67 ms per token,    16.48 tokens per second)
llama_print_timings:       total time = 22057.70 ms
Llama.generate: prefix-match hit


  Thank you for providing additional context! Based on the updated information, the answer to the original query "When was hack week?" remains the same: Hack Week is scheduled to take place from May 12th to May 16th, 2023.



llama_print_timings:        load time = 13161.57 ms
llama_print_timings:      sample time =    41.09 ms /    58 runs   (    0.71 ms per token,  1411.40 tokens per second)
llama_print_timings: prompt eval time = 20685.67 ms /   691 tokens (   29.94 ms per token,    33.40 tokens per second)
llama_print_timings:        eval time =  3456.09 ms /    57 runs   (   60.63 ms per token,    16.49 tokens per second)
llama_print_timings:       total time = 24259.49 ms


In [15]:
# Inspect the first source node attached to the current `response`.
response.source_nodes[0]

NodeWithScore(node=TextNode(id_='dbcf48f3-b173-4787-88c9-8bba2504ee5b', embedding=None, metadata={'state': 'open', 'created_at': '2023-01-06T11:37:16Z', 'url': 'https://api.github.com/repos/alan-turing-institute/Hut23/issues/1307', 'source': 'https://github.com/alan-turing-institute/Hut23/issues/1307', 'labels': []}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='1307', node_type=None, metadata={'state': 'open', 'created_at': '2023-01-06T11:37:16Z', 'url': 'https://api.github.com/repos/alan-turing-institute/Hut23/issues/1307', 'source': 'https://github.com/alan-turing-institute/Hut23/issues/1307', 'labels': []}, hash='0015303083b47352e82d62936b27054a4473b1f56406be024f4aa0af991f1588')}, hash='66a2aa4cf292e3e2155ee5238556063f2fd98cc818d33b123c4f7cf3f6e9cee5', text="Hack week 12-16 June 2023\n## Hack week timeline\r\n\r\nImportant dates | \xa0\r\n-- | --\r\n12 May | Deadline for project proposals\r\n1

In [16]:
# Chat engine over the same index, using chat_mode="context" and the same system prompt.
chat_engine = index.as_chat_engine(chat_mode="context", system_prompt=system_prompt)

In [17]:
# Put the hack-week question to the chat engine and print the reply text.
response = chat_engine.chat("When was hack week?")
print(response.response)

Llama.generate: prefix-match hit


  Hack Week is a recurring event at Google, where employees are given a week off to work on side projects or personal projects. It is typically held in the spring and fall of each year, and is an opportunity for Googlers to explore new ideas, collaborate with colleagues, and showcase their creations to the company.
Some specific dates for Hack Week include:
* Spring Hack Week: usually takes place in late March or early April
* Fall Hack Week: usually takes place in late September or early October
Please note that these dates are subject to change, and may vary depending on Google's internal planning and scheduling.



llama_print_timings:        load time = 13161.57 ms
llama_print_timings:      sample time =    98.72 ms /   139 runs   (    0.71 ms per token,  1408.04 tokens per second)
llama_print_timings: prompt eval time =   570.33 ms /    12 tokens (   47.53 ms per token,    21.04 tokens per second)
llama_print_timings:        eval time =  7300.44 ms /   138 runs   (   52.90 ms per token,    18.90 tokens per second)
llama_print_timings:       total time =  8155.91 ms


In [18]:
# Send the identical question again on the same chat_engine (this rebinds `response`).
response = chat_engine.chat("When was hack week?")
print(response.response)

Llama.generate: prefix-match hit


  Based on the information provided in the context, Hack Week is scheduled to take place from June 12th to June 16th, 2023.



llama_print_timings:        load time = 13161.57 ms
llama_print_timings:      sample time =    26.78 ms /    38 runs   (    0.70 ms per token,  1418.86 tokens per second)
llama_print_timings: prompt eval time = 40089.27 ms /  1330 tokens (   30.14 ms per token,    33.18 tokens per second)
llama_print_timings:        eval time =  2489.51 ms /    37 runs   (   67.28 ms per token,    14.86 tokens per second)
llama_print_timings:       total time = 42659.01 ms
