In [None]:
from llama_index import (
    SimpleDirectoryReader,
    LangchainEmbedding,
    GPTListIndex,
    GPTVectorStoreIndex,
    PromptHelper,
    LLMPredictor,
    ServiceContext,
    Document
)
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.llms.base import LLM
from langchain.chat_models import ChatOpenAI

import pandas as pd
import torch
import transformers
from transformers import pipeline, AutoModel, AutoModelForCausalLM, AutoTokenizer
import accelerate

from tqdm.notebook import tqdm

In [None]:
import logging
logging.getLogger().setLevel(logging.CRITICAL)

In [2]:
# test = CustomLLM(model_name=model_name,
#                  pipeline=distilgpt2_pipeline)

In [3]:
wiki = pd.read_csv("../../data/wiki-scraped.csv")
wiki["body"] = wiki["body"].astype("str")
wiki.head()

Unnamed: 0.1,Unnamed: 0,url,id,title,is_public,body,summary,author,keywords
0,0,https://github.com/alan-turing-institute/resea...,,wrong,False,Information and tips and tricks on working fro...,Information and tips and tricks on working fro...,,working Teams and for screen using Wellbeing c...
1,1,https://github.com/alan-turing-institute/resea...,,wrong,False,REG Buddy Sign Up Sheet \nThis page is for org...,REG Buddy Sign Up Sheet \nThis page is for org...,,Sheet REG Up Buddy Sign Matches
2,2,https://github.com/alan-turing-institute/resea...,,wrong,False,Acknowledging Turing Funding \nWriting tips \n...,Acknowledging Turing Funding \nWriting tips \n...,,Turing Funding Books Writing Acknowledging Blo...
3,3,https://github.com/alan-turing-institute/resea...,,wrong,False,Next Meeting \nLightning talk sessions are hel...,Next Meeting \nLightning talk sessions are hel...,,Signup List Meeting Next
4,4,https://github.com/alan-turing-institute/resea...,,wrong,False,"Wellbeing \n\nWellbeing Events \n\nEquality, D...","Wellbeing \n\nWellbeing Events \n\nEquality, D...",,"and Inclusion Wellbeing Diversity Equality,"


In [4]:
handbook = pd.read_csv("../../data/handbook-scraped.csv")
handbook["body"] = handbook["body"].astype("str")
handbook.head()

Unnamed: 0.1,Unnamed: 0,url,id,title,is_public,body,summary,author,keywords
0,0,https://alan-turing-institute.github.io/REG-ha...,,Docs,True,,,,
1,1,https://alan-turing-institute.github.io/REG-ha...,,Contributing,True,Contributing This section contains a guide for...,Contributing This section contains a guide for...,,Contributing
2,2,https://alan-turing-institute.github.io/REG-ha...,,Advanced,True,Advanced Using Data Through using templates Hu...,Advanced Using Data Through using templates Hu...,,Partial Using Shortcodes Creating Advanced Dat...
3,3,https://alan-turing-institute.github.io/REG-ha...,,Recognising Contributions,True,Recognising Contributions We aim to recognise ...,Recognising Contributions We aim to recognise ...,,Contributions Recognising
4,4,https://alan-turing-institute.github.io/REG-ha...,,Editing a Page,True,Editing a Page If you followed the instruction...,Editing a Page If you followed the instruction...,,Editing Hugo Theme Content Shortcodes a Matter...


In [5]:
text_list = list(wiki["body"]) + list(handbook["body"])
documents = [Document(t) for t in text_list]

In [None]:
hfemb = HuggingFaceEmbeddings()
embed_model = LangchainEmbedding(hfemb)

- Are documents cut up? How are they split up?

## Using OpenAI API

(First set up your `OPENAI_API_KEY` environment variable!)

In [6]:
# set number of output tokens
num_output = 512
# set maximum input size
max_input_size = 4096
# set maximum chunk overlap
max_chunk_overlap = 20
chunk_size_limit = 600

prompt_helper = PromptHelper(context_window=max_input_size,
                             num_output=num_output,
                             chunk_size_limit=chunk_size_limit,
                             max_chunk_overlap=max_chunk_overlap)

In [7]:
llm_predictor = LLMPredictor(llm=ChatOpenAI(temperature=0.7,
                                            model="gpt-3.5-turbo",
                                            max_tokens=num_output))

In [25]:
service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor,
                                               embed_model=embed_model,
                                               prompt_helper=prompt_helper)
index = GPTVectorStoreIndex.from_documents(documents,
                                           service_context=service_context)
query_engine = index.as_query_engine()

In [27]:
response = query_engine.query("What are the events that REG run?")

In [28]:
response.response

'REG runs Tech Talks and Reading and Discussion Groups, as well as having monthly all-REG meetings. They also participate in Turing Events such as Catchups & Town Halls and Hack Week.'

In [20]:
print(response.response)

REG runs several regular events, including Humanities & Data Science discussion group, Bayesian reading group, and Category Theory Reading Group. Information on these groups can be found in The REGistry. REG also has a non-work based Bookclub that runs as well. Additionally, roughly once a month, REG has an all-REG meeting where new joiners are welcomed and news from around REG or the Turing is presented. REG members are also encouraged to attend other events happening around the Turing, such as Turing Catchups & Town Halls, public talks, workshops, and the annual Hack Week.


In [29]:
response = query_engine.query("What reading groups do REG have?")

In [30]:
print(response.get_formatted_sources())

> Source (Doc id: 109bf3ad-0f30-4a49-b0ce-696ff5557c5e): Reading Groups As part of projects or for general interest, members of the team create reading gr...

> Source (Doc id: ab2d8495-df7d-41e2-82e6-3c6df5256158): This is a page for adding information and times about regular events that occur in REG and across...


In [31]:
print(response.response)

The new context provided is not directly related to the question asking about reading groups, so the original answer still stands: REG has several reading groups, including a Reinforcement Learning reading group, a Bitcoin reading group, a Humanities & Data Science discussion group, a Bayesian reading group, and a Category Theory Reading Group. The contact details for these groups are in The-REGistry.


In [33]:
response = query_engine.query("What are 22 days collaborations?")
print(response.response)

There is no mention of "22 days collaborations" in the given context information.


In [34]:
response = query_engine.query("what was my first query to you?")
print(response.response)

I'm sorry, but the context information provided does not mention any query or question being asked.


In [35]:
response = query_engine.query("What is the starting salary for a Senior RSE in REG?")
print(response.response)

Based on the given context, the starting salary for a Senior RSE in REG is within the salary range of £49,025 to £59,150 per year. However, the exact starting salary is not provided and may depend on factors such as the candidate's experience and qualifications. It is also important to note that on promotion, individuals will be appointed at the bottom of the salary range for the role.


In [36]:
response = query_engine.query("What is the medium salary for a Senior RSE in REG?")
print(response.response)

Based on the provided salary range for Senior RSEs in REG (which spans all of Band 4 and some of Band 5), and assuming that the Senior RSE is appointed at the bottom of the salary range for the role on promotion, the median salary for a Senior RSE in REG can be estimated to be around £49,025. However, it's important to note that this estimate is based on assumptions and the actual salary for a Senior RSE may vary depending on various factors.


In [37]:
response = query_engine.query("How do I get leadership to pay me more?")
print(response.response)

Based on the new context provided, it seems that the pay bands are standardized across the institute and do not distinguish between roles such as 3a and 3b, which makes it difficult to negotiate for a higher pay based on your specific role. However, it's still important to consistently perform at a high level and exceed expectations, as this could lead to opportunities for promotions and pay raises within the standardized pay bands. It may also be worth discussing your performance and potential for growth with your manager and HR to better understand how you can progress within the standardized pay bands.


## Using falcon-7b

In [17]:
model_name = "tiiuae/falcon-7b"
falcon_7b = pipeline(
    "text-generation",
    model=model_name,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
)

class CustomLLM(LLM):
    model_name: str
    pipeline: transformers.pipelines.text_generation.TextGenerationPipeline
        
    @property
    def _llm_type(self) -> str:
        return "custom"
    
    def _call(self, prompt, stop=None):
        return self.pipeline(prompt, max_new_tokens=9999)[0]["generated_text"]
    
    @property
    def _identifying_params(self) -> dict:
        """Get the identifying parameters."""
        return {"model_name": self.model_name}
    
llm_predictor_falcon_7b = LLMPredictor(llm=CustomLLM(model_name=model_name,
                                                     pipeline=falcon_7b))

Explicitly passing a `revision` is encouraged when loading a configuration with custom code to ensure no malicious code has been contributed in a newer revision.
Explicitly passing a `revision` is encouraged when loading a configuration with custom code to ensure no malicious code has been contributed in a newer revision.
Explicitly passing a `revision` is encouraged when loading a model with custom code to ensure no malicious code has been contributed in a newer revision.
Loading checkpoint shards: 100%|███████████████████| 2/2 [00:20<00:00, 10.14s/it]
The model 'RWForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'CodeGenForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'GitForCau

For this model, we must have a lower `max_input_size` and smaller `chunk_size_limit`.

Let's load in the model and inspect the architecture (there might be a better way to do this...)

In [25]:
falcon = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)

Explicitly passing a `revision` is encouraged when loading a configuration with custom code to ensure no malicious code has been contributed in a newer revision.
Explicitly passing a `revision` is encouraged when loading a model with custom code to ensure no malicious code has been contributed in a newer revision.
Loading checkpoint shards: 100%|███████████████████| 2/2 [00:19<00:00,  9.92s/it]


In [26]:
falcon

RWForCausalLM(
  (transformer): RWModel(
    (word_embeddings): Embedding(65024, 4544)
    (h): ModuleList(
      (0-31): 32 x DecoderLayer(
        (input_layernorm): LayerNorm((4544,), eps=1e-05, elementwise_affine=True)
        (self_attention): Attention(
          (maybe_rotary): RotaryEmbedding()
          (query_key_value): Linear(in_features=4544, out_features=4672, bias=False)
          (dense): Linear(in_features=4544, out_features=4544, bias=False)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): MLP(
          (dense_h_to_4h): Linear(in_features=4544, out_features=18176, bias=False)
          (act): GELU(approximate='none')
          (dense_4h_to_h): Linear(in_features=18176, out_features=4544, bias=False)
        )
      )
    )
    (ln_f): LayerNorm((4544,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=4544, out_features=65024, bias=False)
)

Looks like context length is 4544 (is this right?). Let's set `max_input_size` to this.

In [None]:
# set number of output tokens
num_output = 512
# set maximum input size
max_input_size = 4544
# set maximum chunk overlap
max_chunk_overlap = 20
chunk_size_limit = 600

prompt_helper = PromptHelper(context_window=max_input_size,
                             num_output=num_output,
                             chunk_size_limit=chunk_size_limit,
                             max_chunk_overlap=max_chunk_overlap)

In [None]:
service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor_falcon_7b,
                                               embed_model=embed_model,
                                               prompt_helper=prompt_helper)
index = GPTVectorStoreIndex.from_documents(documents,
                                           service_context=service_context)
query_engine_falcon_7b = index.as_query_engine()

In [None]:
response = query_engine_falcon_7b.query("What is the starting salary for a Senior RSE in REG?")
print(response.response)

In [None]:
response = query_engine_falcon_7b.query("How much budget do we have for professional development activities?")
print(response.response)