# Questions and Answers using Llama 2

Model: Llama 2 7b-chat

Data: Handbook test data set

In [1]:
from reginald.models.setup_llm import setup_llm

from reginald.models.models.llama_index import (
    setup_settings,
    LlamaIndexLlamaCPP,
    set_global_tokenizer,
    compute_default_chunk_size,
)





In [2]:
from llama_index.core.evaluation import DatasetGenerator, RelevancyEvaluator
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex, Response
from llama_index.llms.openai import OpenAI
from llama_index.llms.ollama import Ollama

from llama_index.core import (
    StorageContext,
    load_index_from_storage,
)

from transformers import AutoTokenizer
import nest_asyncio

nest_asyncio.apply()

In [3]:
# Use ollama
ollama_llm = Ollama(model="llama2:7b-chat", request_timeout=60.0)

In [4]:
chunk_size = compute_default_chunk_size(
    max_input_size=4096, k=3
)  # calculate chunk size

In [5]:
tokenizer = AutoTokenizer.from_pretrained(
    "meta-llama/Llama-2-7b-chat-hf"
).encode  # load tokenizer
set_global_tokenizer(tokenizer)

In [6]:
settings = setup_settings(
    llm=ollama_llm,
    max_input_size=4096,
    num_output=512,
    chunk_overlap_ratio=0.1,
    chunk_size=chunk_size,
    k=3,
    tokenizer=tokenizer,
)  # these are settings for the storage context

## Load in the Data

In [7]:
# To load in the handbook

storage_context = StorageContext.from_defaults(
    persist_dir="../../data/llama_index_indices/handbook/"
)

vector_index = load_index_from_storage(
    storage_context=storage_context,
    settings=settings,
)  # load the data index from storage

In [8]:
# To load in the example Paul Graham data
# reader = SimpleDirectoryReader("./data/paul_graham/")
# documents = reader.load_data()
# vector_index = VectorStoreIndex.from_documents(documents)

In [9]:
dataset_generator = DatasetGenerator.from_documents(
    vector_index.docstore.docs.values(),
    num_questions_per_chunk=1,
    show_progress=True,
    llm=ollama_llm
)

Parsing nodes:   0%|          | 0/39 [00:00<?, ?it/s]

  return cls(


## Generate Questions

In [None]:
# eval_questions = dataset_generator.generate_questions_from_nodes(5)  # generate questions from the documents

In [None]:
# eval_questions

In [10]:
# load in the jsonl file
import json
import pandas as pd
import jsonlines


file_path = '/Users/kgoldmann/Documents/Projects/reginald/data/handbook_qa/data/qAndA.json'

def load_multiline_jsonl(file_path):
    objects = []
    buffer = ""
    with open(file_path, 'r') as file:
        for line in file:
            buffer += line
            try:
                obj = json.loads(buffer)
                objects.append(obj)
                buffer = ""  # Reset the buffer after a successful load
            except json.JSONDecodeError:
                # If a JSONDecodeError occurs, it means the current buffer is not a complete JSON object yet
                continue
    return objects


json_objects = load_multiline_jsonl(file_path)

{'prompt': 'What is a page in the handbook made of',
 'completion': 'A page in the handbook is made of a YAML front matter section followed by the page contents in Markdown. The front matter contains keys such as `title` and `weight`, which define metadata about the page. The content section is formatted in Markdown and can contain built-in or custom shortcodes.'}

In [15]:
eval_questions = [x['prompt'] for x in json_objects]
gpt3_answers = [x['completion'] for x in json_objects]

eval_questions = eval_questions[:10]
gpt3_answers = gpt3_answers[:10]

## Answer the Questions with Llama 2

In [13]:
evaluator_gpt4 = RelevancyEvaluator(llm=ollama_llm)

In [None]:
# for every value in eval_questions, get the response from the llm
all_dfs = []

for i in range(len(eval_questions)):
    query_engine = vector_index.as_query_engine()
    response_vector = query_engine.query(eval_questions[i])
    eval_result = evaluator_gpt4.evaluate_response(
        query=eval_questions[i], response=response_vector
    )

    # append to the all_dfs list
    all_dfs.append({'eval_result': eval_result,
                    'question': eval_questions[i],
                    'response': response_vector})


In [None]:
# extract the different elements
questions = [x['question'] for x in all_dfs]
response = [x['response'] for x in all_dfs]
sources = [x['response'].source_nodes for x in all_dfs]
#sources = [x[0].node.get_content() for x in sources]

# for the each sources, get the content as one string deliminated by ;
sources = [";".join([x.get_content() for x in y]) for y in sources]

match = [x['eval_result'].passing for x in all_dfs] # does the response match the source

In [None]:
# extract the questions
combined_df = pd.DataFrame({'questions': questions, 'response': response, 'sources': sources, 'match': match})
combined_df.to_csv("data/handbook_QandA.csv", index=False)

In [None]:
combined_df_out = combined_df.style.set_properties(
        **{
            "inline-size": "600px",
            "overflow-wrap": "break-word",
        }
    )
display(combined_df_out)

## Ask Reginal the Questions

In [16]:
response_model = setup_llm(
    model="llama-index-llama-cpp",
    model_name="../../../llama-2-7b-chat.Q4_K_M.gguf",
    data_dir="../../data/",
    which_index="handbook",
)

llama_model_loader: loaded meta data with 19 key-value pairs and 291 tensors from /Users/kgoldmann/Library/Caches/llama_index/models/llama-2-7b-chat.Q4_K_M.gguf (version GGUF V2)
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = LLaMA v2
llama_model_loader: - kv   2:                       llama.context_length u32              = 4096
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 11008
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loader: - kv   7:            

In [17]:
def get_response(question):
    resp = response_model.direct_message(message=que, user_id="")
    return(resp.message)

In [18]:
all_questions = eval_questions #combined_df['questions'].tolist()
reg_responses = []
for que in all_questions:
        reg_responses.append(get_response(que))


llama_print_timings:        load time =    6460.86 ms
llama_print_timings:      sample time =      27.92 ms /   300 runs   (    0.09 ms per token, 10743.45 tokens per second)
llama_print_timings: prompt eval time =   20115.94 ms /  1524 tokens (   13.20 ms per token,    75.76 tokens per second)
llama_print_timings:        eval time =   20862.33 ms /   299 runs   (   69.77 ms per token,    14.33 tokens per second)
llama_print_timings:       total time =   41499.15 ms /  1823 tokens
Llama.generate: prefix-match hit

llama_print_timings:        load time =    6460.86 ms
llama_print_timings:      sample time =      38.89 ms /   405 runs   (    0.10 ms per token, 10412.92 tokens per second)
llama_print_timings: prompt eval time =    8871.27 ms /   560 tokens (   15.84 ms per token,    63.13 tokens per second)
llama_print_timings:        eval time =   28696.71 ms /   404 runs   (   71.03 ms per token,    14.08 tokens per second)
llama_print_timings:       total time =   38295.25 ms /   964 

In [19]:
reg_responses

['  A page in the REG Handbook is made up of several components:\n1. Front Matter: The front matter is the top section of a content file and contains metadata about the page, such as its title, weight, and other information. It is defined using YAML and is written at the top of each page.\n2. Content Section: The content section is the main section of the page and contains the actual content of the page. It is written in Markdown format and can include text, images, tables, and other elements.\n3. Shortcodes: Shortcodes are predefined templates that can be used to include more complex features or content in a page. They can be called using the `{{% shortcode }}` syntax and can be particularly useful for including things like figures, gists, highlights, and references.\n4. Theme Shortcodes: The theme has its own set of shortcodes that can be used to include additional functionality or content in a page. These shortcodes are typically defined in the theme\'s README file.\n5. Repository S

## Evaluate the Answers

In [20]:
from llama_index.core.evaluation import CorrectnessEvaluator

In [None]:
# evaluator_gpt4 = RelevancyEvaluator(llm=ollama_llm)

### Correctness Evaluation

In [21]:
evaluator = CorrectnessEvaluator(llm=ollama_llm)

In [22]:
all_questions

['What is a page in the handbook made of',
 'What is the purpose of front matter',
 'What are the keys most commonly used in front matter',
 'What does the weight key do',
 'Where can you find the predefined front matter keys for the handbook theme',
 'What is the content section of a page in the handbook',
 'What is Markdown',
 'What are shortcodes in the handbook',
 'What is the difference between using a shortcode and raw HTML in a content file',
 'What are some of the built-in shortcodes in Hugo']

In [28]:
outputs = []

for i in range(len(all_questions)):
    print(i)
    query = all_questions[i]

    reference = gpt3_answers[i]

    response = reg_responses[i]

    result = evaluator.evaluate(
        query=query,
        response=response,
        contexts=[reference],
    )

    outputs.append(result)

0
1
2
3
4
5
6
7
8
9


In [29]:
print(len(reg_responses), len(all_questions), len(gpt3_answers), len(outputs))

10 10 10 10


In [30]:
outputs_df= pd.DataFrame({
    'question': all_questions,
    'reg_response': reg_responses,
    'gpt_response':gpt3_answers,
    'correct': [x.passing for x in outputs]
})

In [31]:
outputs_df

Unnamed: 0,question,reg_response,gpt_response,correct
0,What is a page in the handbook made of,A page in the REG Handbook is made up of sev...,A page in the handbook is made of a YAML front...,True
1,What is the purpose of front matter,The purpose of front matter in a book or doc...,The purpose of front matter in a Markdown file...,True
2,What are the keys most commonly used in front ...,The keys or elements most commonly used in f...,The most commonly used keys in front matter ar...,
3,What does the weight key do,In the context of front matter in a book or ...,The weight key in the front matter of a page d...,True
4,Where can you find the predefined front matter...,The Handbook theme for Hugo provides several...,The predefined front matter keys for the handb...,True
5,What is the content section of a page in the h...,"In the Handbook theme for Hugo, the ""content...",The content section of a page in the handbook ...,True
6,What is Markdown,Markdown is a lightweight markup language th...,Markdown is a formatting language used to crea...,
7,What are shortcodes in the handbook,"In the Handbook theme for Hugo, shortcodes a...",Shortcodes are templates that can be parameter...,True
8,What is the difference between using a shortco...,Shortcodes and raw HTML are two different wa...,Using a shortcode is preferable to including r...,True
9,What are some of the built-in shortcodes in Hugo,Hugo comes with several built-in shortcodes ...,Hugo has a set of built-in shortcodes includin...,True


In [32]:
outputs_df.to_csv("data/handbook_QandA_correctness.csv", index=False)