In [None]:
#https://medium.com/@csakash03/evaluating-rag-with-llamaindex-3f74a35c53fa

#another link for eval
#https://docs.llamaindex.ai/en/stable/examples/node_postprocessor/MetadataReplacementDemo.html

In [28]:
import random
import nest_asyncio
from llama_index.core import VectorStoreIndex, ServiceContext, SimpleDirectoryReader, Document
from llama_index.llms.llama_cpp import LlamaCPP
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.node_parser import SentenceWindowNodeParser, SentenceSplitter
from llama_index.core.indices.postprocessor import SentenceTransformerRerank, MetadataReplacementPostProcessor
from llama_index.core.evaluation import DatasetGenerator, CorrectnessEvaluator, RelevancyEvaluator, FaithfulnessEvaluator
from llama_index.core.evaluation.eval_utils import get_responses, get_results_df
from llama_index.core.evaluation import BatchEvalRunner

In [29]:

# Initialize LLAMA model and components
llm = LlamaCPP(
    model_url='https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.2-GGUF/resolve/main/mistral-7b-instruct-v0.2.Q4_K_M.gguf',
    temperature=1,
    model_kwargs={"n_gpu_layers": -1},
    verbose=True
)
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
rerank = SentenceTransformerRerank(top_n=2, model="BAAI/bge-reranker-base")
postproc = MetadataReplacementPostProcessor(target_metadata_key="window")


llama_model_loader: loaded meta data with 24 key-value pairs and 291 tensors from /Users/yasaminabbaszadegan/Library/Caches/llama_index/models/mistral-7b-instruct-v0.2.Q4_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = mistralai_mistral-7b-instruct-v0.2
llama_model_loader: - kv   2:                       llama.context_length u32              = 32768
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32      

In [23]:

# Create node parser and postprocessors
node_parser = SentenceWindowNodeParser.from_defaults(
    window_size=5,
    window_metadata_key="window",
    original_text_metadata_key="original_text"
)
postprocessors = [postproc, rerank]

# Load documents and build sentence index
docs = SimpleDirectoryReader(input_files=['./temp/4583673.pdf']).load_data()
documents = Document(text="\n\n".join([doc.text for doc in docs]))
sentence_context = ServiceContext.from_defaults(llm=llm, embed_model=embed_model, node_parser=node_parser)
sentence_index = VectorStoreIndex.from_documents([documents], service_context=sentence_context)

# Setup evaluation dataset
num_nodes_eval = 2
base_nodes = SentenceSplitter().get_nodes_from_documents(docs)
sample_eval_nodes = random.sample(base_nodes, num_nodes_eval)
dataset_generator = DatasetGenerator(sample_eval_nodes, llm=llm, show_progress=True, num_questions_per_chunk=4)
eval_dataset = await dataset_generator.agenerate_dataset_from_nodes()

# Save evaluation dataset
eval_dataset.save_json("./eval_dataset/qr_dataset.json")

  dataset_generator = DatasetGenerator(
  0%|          | 0/2 [00:00<?, ?it/s]
llama_print_timings:        load time =    5912.42 ms
llama_print_timings:      sample time =       9.63 ms /   107 runs   (    0.09 ms per token, 11106.50 tokens per second)
llama_print_timings: prompt eval time =    6414.72 ms /   767 tokens (    8.36 ms per token,   119.57 tokens per second)
llama_print_timings:        eval time =    2031.34 ms /   106 runs   (   19.16 ms per token,    52.18 tokens per second)
llama_print_timings:       total time =    8588.95 ms /   873 tokens
Llama.generate: prefix-match hit

llama_print_timings:        load time =    5912.42 ms
llama_print_timings:      sample time =      13.11 ms /   109 runs   (    0.12 ms per token,  8316.17 tokens per second)
llama_print_timings: prompt eval time =    1213.73 ms /   636 tokens (    1.91 ms per token,   524.01 tokens per second)
llama_print_timings:        eval time =    2078.65 ms /   108 runs   (   19.25 ms per token,    51.96 toke

In [24]:
eval_dataset.dict()

{'queries': {'87ce6719-5d76-4acb-a099-860077ccb20e': 'Who are the property owners of PLAN 254E PT LOTS 23 & 24, as indicated by the document?',
  '861f10f6-ea84-4465-b65b-842fab947ccd': 'What is the last date for appealing this decision to the Ontario Municipal Board, as stated in the document?',
  'a365db2b-ca96-4d00-85d6-831f1073cb8a': 'What is the filing fee required to appeal this decision to the Ontario Municipal Board, as stated in the document?',
  '7074585b-f607-4881-b467-10756f8fdb84': 'What is the legal description and community of the property address 91 Leuty Ave, according to the document?',
  'c1c5b476-5edc-4361-a9b5-0275ae3bc085': 'What are the two requirements mentioned in the document that would not be met by the proposed rear deck addition?',
  '6908c8a9-0c41-41a0-b650-ccc7f4a825a4': 'Which two sections of By-law 438 -86 are referenced in the document in regards to parking and access requirements?',
  'e6d660b9-2790-4a8f-9071-feaf698960b4': 'What are the two dimension

In [7]:
eval_dataset.save_json("./eval_dataset/qr_dataset.json")
# optional
# eval_dataset = QueryResponseDataset.from_json("data/ipcc_eval_qr_dataset.json")



In [25]:


# Setup evaluators
evaluator_c = CorrectnessEvaluator(llm=llm)
evaluator_r = RelevancyEvaluator(llm=llm)
evaluator_f = FaithfulnessEvaluator(llm=llm)

# Setup query engines
base_query_engine = sentence_index.as_query_engine(similarity_top_k=6)
query_engine = sentence_index.as_query_engine(
    similarity_top_k=6,
    node_postprocessors=[
        MetadataReplacementPostProcessor(target_metadata_key="window"),
        SentenceTransformerRerank(top_n=2, model="BAAI/bge-reranker-base")
    ]
)



In [26]:
from llama_index.core.evaluation.eval_utils import (
    get_responses,
    get_results_df,
)
from llama_index.core.evaluation import BatchEvalRunner

max_samples = 30

eval_qs = eval_dataset.questions
ref_response_strs = [r for (_, r) in eval_dataset.qr_pairs]

# resetup base query engine and sentence window query engine
# base query engine
node_parser = SentenceSplitter()
documents = SimpleDirectoryReader(input_files=['./temp/4583673.pdf']).load_data()
documents = Document(text="\n\n".join([doc.text for doc in documents]))
base_index = VectorStoreIndex.from_documents([documents], service_context=sentence_context)
base_query_engine = base_index.as_query_engine(similarity_top_k=6)
# sentence window query engine

query_engine = sentence_index.as_query_engine(
    similarity_top_k=6,
    # the target key defaults to `window` to match the node_parser's default
    node_postprocessors=[
        MetadataReplacementPostProcessor(target_metadata_key="window"),SentenceTransformerRerank(top_n=2, model="BAAI/bge-reranker-base")
    ],
)
import numpy as np

base_pred_responses = get_responses(
    eval_qs[:max_samples], base_query_engine, show_progress=True
)
pred_responses = get_responses(
    eval_qs[:max_samples], query_engine, show_progress=True
)




pred_response_strs = [str(p) for p in pred_responses]
base_pred_response_strs = [str(p) for p in base_pred_responses]


  0%|          | 0/8 [00:00<?, ?it/s]Llama.generate: prefix-match hit

llama_print_timings:        load time =    5912.42 ms
llama_print_timings:      sample time =      25.67 ms /   254 runs   (    0.10 ms per token,  9895.98 tokens per second)
llama_print_timings: prompt eval time =    2231.75 ms /   298 tokens (    7.49 ms per token,   133.53 tokens per second)
llama_print_timings:        eval time =    4714.45 ms /   253 runs   (   18.63 ms per token,    53.66 tokens per second)
llama_print_timings:       total time =    7367.71 ms /   551 tokens
Llama.generate: prefix-match hit

llama_print_timings:        load time =    5912.42 ms
llama_print_timings:      sample time =      11.82 ms /    96 runs   (    0.12 ms per token,  8119.08 tokens per second)
llama_print_timings: prompt eval time =    1531.18 ms /   826 tokens (    1.85 ms per token,   539.45 tokens per second)
llama_print_timings:        eval time =    1854.58 ms /    95 runs   (   19.52 ms per token,    51.22 tokens per 

In [27]:
# Get responses for evaluation
nest_asyncio.apply()

max_samples = 30
eval_qs = eval_dataset.questions
ref_response_strs = [r for (_, r) in eval_dataset.qr_pairs]
base_pred_responses = get_responses(eval_qs[:max_samples], base_query_engine, show_progress=True)
pred_responses = get_responses(eval_qs[:max_samples], query_engine, show_progress=True)

# Evaluate responses
evaluator_dict = {
    "correctness": evaluator_c, #Correctness: Compares the generated answer against the ground-truth answer.
    "faithfulness": evaluator_f, #Faithfulness: Evaluates whether a response is faithful to the contexts (label-free)
    "relevancy": evaluator_r, #to measure if the response + source nodes match the query.This is useful for measuring if the query was actually answered by the response
}
batch_runner = BatchEvalRunner(evaluator_dict, workers=2, show_progress=True)
eval_results = await batch_runner.aevaluate_responses(
    queries=eval_qs[:max_samples],
    responses=pred_responses[:max_samples],
    reference=ref_response_strs[:max_samples]
)
base_eval_results = await batch_runner.aevaluate_responses(
    queries=eval_qs[:max_samples],
    responses=base_pred_responses[:max_samples],
    reference=ref_response_strs[:max_samples]
)

# Display results
results_df = get_results_df(
    [eval_results, base_eval_results],
    ["Sentence Window Retriever", "Base Retriever"],
    ["correctness", "relevancy", "faithfulness"]
)
display(results_df)

  0%|          | 0/24 [00:00<?, ?it/s]Llama.generate: prefix-match hit

llama_print_timings:        load time =    5912.42 ms
llama_print_timings:      sample time =       4.78 ms /    36 runs   (    0.13 ms per token,  7532.96 tokens per second)
llama_print_timings: prompt eval time =    2521.12 ms /  1296 tokens (    1.95 ms per token,   514.06 tokens per second)
llama_print_timings:        eval time =     732.70 ms /    35 runs   (   20.93 ms per token,    47.77 tokens per second)
llama_print_timings:       total time =    3332.59 ms /  1331 tokens
Llama.generate: prefix-match hit

llama_print_timings:        load time =    5912.42 ms
llama_print_timings:      sample time =      13.91 ms /    79 runs   (    0.18 ms per token,  5681.41 tokens per second)
llama_print_timings: prompt eval time =    1795.39 ms /   934 tokens (    1.92 ms per token,   520.22 tokens per second)
llama_print_timings:        eval time =    1580.93 ms /    78 runs   (   20.27 ms per token,    49.34 tokens per

Unnamed: 0,names,correctness,relevancy,faithfulness
0,Sentence Window Retriever,5.0,1.0,0.75
1,Base Retriever,4.875,0.875,0.875
