In [1]:
from datasets import load_dataset

# Load the WikiEval dataset; later cells read its 'train' split.
ds = load_dataset(path="explodinggradients/WikiEval")

In [2]:
from tqdm import tqdm
import nest_asyncio
from tqdm.asyncio import tqdm_asyncio

# Patch asyncio so event loops can be nested; the `asyncio.run(...)` call made
# later in this notebook would otherwise presumably fail inside Jupyter's
# already-running loop.
nest_asyncio.apply()

In [3]:
import os
from getpass import getpass

# Read the key from the environment and prompt only when it is missing, so no
# credential is ever stored in the notebook source or its saved outputs.
# `api_key` stays defined because the judge LLMs below receive it explicitly.
api_key = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")
os.environ["OPENAI_API_KEY"] = api_key

In [4]:
from chat_handle import ChatHandler

In [5]:
# Create the project-local ChatHandler bound to the named index; its
# `chat_engine` and `index` attributes are read in later cells.
chatHandler = ChatHandler(index_name="eval-data-mining-embeddings-test")

In [6]:
# Peek at the first training record to see its fields (answer, question,
# context_v1, context_v2). Note the 'Question: ' prefix on the question text.
ds['train'][0]

{'answer': 'Answer: The PSLV-C56 mission is scheduled to be launched on Sunday, 30 July 2023 at 06:30 IST / 01:00 UTC. It will be launched from the Satish Dhawan Space Centre, Sriharikota, Andhra Pradesh, India.',
 'question': 'Question: When is the scheduled launch date and time for the PSLV-C56 mission, and where will it be launched from?',
 'context_v1': ["The PSLV-C56 is the 58th mission of Indian Space Research Organisation's Polar Satellite Launch Vehicle (PSLV) and the 17th flight of the PSLV-CA variant, and will be get launched from Satish Dhawan Space Centre First Launch Pad ( FLP ).\n\nLaunch\nIt is Scheduled to get launched on Sunday, 30 July 2023 at 06:30 IST / 01:00 UTC from Satish Dhawan Space Centre, Sriharikota, Andhra Pradesh, India. This is a dedicated commercial mission through NSIL with DS-SAR as primary satellite and VELOX-AM as a co-passenger satellite With other 5 Satellites, All satellites from this mission belongs to Singapore."],
 'context_v2': ["The PSLV-C56 

In [7]:
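# One-time ingestion of both context variants for the first 50 examples.
# NOTE(review): presumably already run against the index, hence left commented
# out so a re-run does not add the texts again — confirm before enabling.
# for i in range(50):
#     chatHandler.add_text(ds['train'][i]['context_v1'][0])
#     chatHandler.add_text(ds['train'][i]['context_v2'][0])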

In [8]:
# Take the handler's chat engine; it answers the dataset questions below.
chat_engine = chatHandler.chat_engine

In [9]:
# Ask the chat engine the first 50 questions, one at a time, keeping the
# responses in dataset order.
# NOTE(review): the same chat engine instance is reused for every question; if
# it keeps conversation history, earlier questions may influence later
# answers — confirm whether a reset between questions is needed.
train_split = ds['train']
chat_responses = []
for i in tqdm(range(50)):
    # Drop the 10-character 'Question: ' prefix before sending the query.
    question = train_split[i]['question'][10:]
    chat_responses.append(chat_engine.chat(question))

100%|██████████| 50/50 [07:07<00:00,  8.55s/it]


In [10]:
# response = chat_engine.chat("When is the scheduled launch date and time for the PSLV-C56 mission, and where will it be launched from?")

In [11]:
# response.source_nodes[0].text

In [12]:
# instantiate the gpt-4o-mini judges
from llama_index.llms.openai import OpenAI
from llama_index.core.evaluation import (
    AnswerRelevancyEvaluator,
    ContextRelevancyEvaluator,
)

# One evaluator per metric, each backed by its own gpt-4o-mini client at
# temperature 0.
judges = {
    "answer_relevancy": AnswerRelevancyEvaluator(
        llm=OpenAI(temperature=0, model="gpt-4o-mini", api_key=api_key),
    ),
    "context_relevancy": ContextRelevancyEvaluator(
        llm=OpenAI(temperature=0, model="gpt-4o-mini", api_key=api_key),
    ),
}

In [13]:
# Queue two evaluation tasks per example: answer relevancy first, then context
# relevancy, so the flat list alternates between the two metrics. Nothing is
# awaited here; the tasks are run in later cells.
eval_tasks = []
for example, prediction in zip(ds['train'], chat_responses):
    # Drop the 10-character 'Question: ' prefix from the dataset question.
    question = example['question'][10:]
    retrieved_texts = [node.text for node in prediction.source_nodes]
    eval_tasks += [
        judges["answer_relevancy"].aevaluate(
            query=question,
            response=prediction.response,
            sleep_time_in_seconds=1.0,
        ),
        judges["context_relevancy"].aevaluate(
            query=question,
            contexts=retrieved_texts,
            sleep_time_in_seconds=1.0,
        ),
    ]

In [14]:
# Await the first 50 queued evaluation tasks together, with a progress bar.
# NOTE(review): if these are coroutine objects they can be awaited only once,
# so re-running this cell would require rebuilding `eval_tasks` — confirm.
eval_results1 = await tqdm_asyncio.gather(*eval_tasks[:50])

100%|██████████| 50/50 [00:13<00:00,  3.60it/s]


In [15]:
# Await the remaining queued evaluation tasks (index 50 onward), with a
# progress bar.
eval_results2 = await tqdm_asyncio.gather(*eval_tasks[50:])

100%|██████████| 50/50 [00:11<00:00,  4.26it/s]


In [16]:
# Stitch the two batches back together in their original task order.
eval_results = [*eval_results1, *eval_results2]

In [17]:
# Tasks were queued in alternating order, so even positions hold the answer
# relevancy results and odd positions hold the context relevancy results.
evals = dict(
    answer_relevancy=eval_results[0::2],
    context_relevancy=eval_results[1::2],
)

In [18]:
from llama_index.core.evaluation.notebook_utils import get_eval_results_df
import pandas as pd

# Build one per-example ("deep") frame and one mean-score frame per metric,
# labelling every result with the "baseline" name.
deep_dfs, mean_dfs = {}, {}
for metric, metric_results in evals.items():
    deep_dfs[metric], mean_dfs[metric] = get_eval_results_df(
        names=["baseline"] * len(metric_results),
        results_arr=metric_results,
        metric=metric,
    )

In [19]:
# Stack the per-metric mean frames into one table indexed by metric name.
mean_scores_df = (
    pd.concat(
        [frame.reset_index() for frame in mean_dfs.values()],
        axis=0,
        ignore_index=True,
    )
    .set_index("index")
    .rename_axis(index="metrics")
)
mean_scores_df

rag,baseline
metrics,Unnamed: 1_level_1
mean_answer_relevancy_score,0.98
mean_context_relevancy_score,0.8925


# Faithfulness

In [20]:
from llama_index.core import (
    VectorStoreIndex,
    SimpleDirectoryReader,
    Response,
)
from llama_index.llms.openai import OpenAI
from llama_index.core.evaluation import FaithfulnessEvaluator
from llama_index.core.node_parser import SentenceSplitter
import pandas as pd

# Set pandas' display.max_colwidth to 0 so long text cells (responses, sources,
# feedback) are presumably shown without truncation.
# NOTE(review): confirm how the installed pandas version treats 0 here.
pd.set_option("display.max_colwidth", 0)

In [21]:
# Faithfulness judge LLM: gpt-4o-mini at temperature 0. Note that the `gpt4`
# names below do not match the configured model.
gpt4 = OpenAI(temperature=0, model="gpt-4o-mini")

evaluator_gpt4 = FaithfulnessEvaluator(llm=gpt4)

In [22]:
from llama_index.core.evaluation import EvaluationResult


# define jupyter display function
def display_eval_df(response: Response, eval_result: EvaluationResult) -> None:
    """Render one response and its evaluation as a single-row styled table.

    Args:
        response: Query response; its first source node supplies the "Source"
            column.
        eval_result: Evaluation outcome; `passing` is shown as Pass/Fail and
            `feedback` as the reasoning.

    Returns:
        None. The table is shown with `display`; when the response has no
        source nodes, "no response!" is printed instead.
    """
    # A truthiness check also covers None and empty non-list sequences, which
    # `== []` would let through to the `[0]` index below.
    if not response.source_nodes:
        print("no response!")
        return

    # Cap the source excerpt at 1000 characters and add an ellipsis only when
    # text was actually cut off.
    source_text = response.source_nodes[0].node.text
    if len(source_text) > 1000:
        source_text = source_text[:1000] + "..."

    eval_df = pd.DataFrame(
        {
            "Response": str(response),
            "Source": source_text,
            "Evaluation Result": "Pass" if eval_result.passing else "Fail",
            "Reasoning": eval_result.feedback,
        },
        index=[0],
    )
    # Keep the Styler under its own name instead of rebinding `eval_df`.
    styled = eval_df.style.set_properties(
        **{
            "inline-size": "600px",
            "overflow-wrap": "break-word",
        },
        subset=["Response", "Source"]
    )
    display(styled)

In [26]:
# Every training question with its 10-character 'Question: ' prefix removed.
eval_questions = [record['question'][10:] for record in ds['train']]

In [28]:
# Show the first five questions as a sanity check on the prefix stripping.
eval_questions[:5]

['When is the scheduled launch date and time for the PSLV-C56 mission, and where will it be launched from?',
 'What is the objective of the Uzbekistan-Afghanistan-Pakistan Railway Project and how is it expected to enhance trade and logistics efficiency?',
 'When was PharmaCann founded and what is its headquarters location?',
 'Who directed the film Oppenheimer and who stars as J. Robert Oppenheimer in the film?',
 'What is theranostics and how does it combine diagnostic and therapeutic approaches in precision medicine?']

In [29]:
import asyncio


def evaluate_query_engine(query_engine, questions, evaluator=None):
    """Query the engine for every question and count the passing responses.

    Args:
        query_engine: Object exposing an async `aquery(question)` method.
        questions: Iterable of question strings.
        evaluator: Object exposing `evaluate_response(response=...)` whose
            result has a `passing` attribute. Defaults to the notebook-level
            `evaluator_gpt4`.

    Returns:
        A `(total_correct, total)` tuple: the number of responses judged
        passing and the number of responses evaluated.
    """
    if evaluator is None:
        # Fall back to the notebook-level faithfulness judge.
        evaluator = evaluator_gpt4

    async def _run_queries():
        # `asyncio.run` expects a coroutine, not the future returned by a bare
        # `asyncio.gather(...)`, so the gather is awaited inside a coroutine.
        # This form works with both stock asyncio and the nest_asyncio patch.
        return await asyncio.gather(*(query_engine.aquery(q) for q in questions))

    results = asyncio.run(_run_queries())
    print("finished query")

    # Responses are judged one at a time, in the order they were requested.
    total_correct = sum(
        1 for r in results if evaluator.evaluate_response(response=r).passing
    )

    return total_correct, len(results)

In [30]:
# Build a query engine from the handler's index for the faithfulness run below.
query_engine = chatHandler.index.as_query_engine()

In [32]:
# Run the faithfulness check on the first 50 questions and report the number
# of passing responses out of the total evaluated.
correct, total = evaluate_query_engine(query_engine, eval_questions[:50])

print(f"score: {correct}/{total}")

finished query
score: 50/50
