# Evaluation of chatbot answers

## Evaluation Runs

1. 09/09/2024: 77/77 [45:17<00:00, 35.29s/it] using only "mistral", reranker "rrf"
2. `20240911_101327`: 77/77 [46:15<00:00, 36.05s/it] testing many models, reranker "rrf"
   - show in the documentation
3. `20240912_105808`: 77/77 [33:16<00:00, 25.93s/it] testing many models, reranker "cross-encoder"
    - Not very different from the previous run despite the different reranker. Not surprising, since it should be very easy to retrieve the correct text chunk for the selected titles
4. `20240912_114908` : 77/77 [39:39<00:00, 30.90s/it] testing many models, reranker cross-encoder, with enriching top-retrieved blog post (enrich_first=True)
   - Not very different from previous runs. Not surprising, as the main text chunk should be sufficient to give a good answer

## Libraries

In [None]:
# Load IPython's `autoreload` extension and switch it to mode 2, so that
# imported modules (e.g. the project's `src` package) are reloaded every time
# before a cell is executed.
%load_ext autoreload
%autoreload 2

In [19]:
import json
import os
import time
import warnings
from collections.abc import Iterable
from pathlib import Path
from typing import Any

import lancedb
import pandas as pd
from dotenv import load_dotenv
from groq import Groq
from lancedb.table import Table as KBaseTable
from matplotlib import pyplot as plt
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

from src.constants import (
    GROUND_TRUTH_FILE,
    GROUND_TRUTH_PATH,
    LANCEDB_URI,
    REPO_PATH,
    get_rag_config,
)
from src.llm_api import (
    build_full_llm_chat_input,
    get_llm_api_client_object,
    get_model_list,
    get_preferred_model,
)
from src.prompt_building import WELCOME_MSG, extract_context_from_msg

# Silence FutureWarning and UserWarning messages for the rest of the session
for _warning_category in (FutureWarning, UserWarning):
    warnings.simplefilter(action="ignore", category=_warning_category)

## Parameters

In [20]:
# Load the RAG configuration once; every section below is read from this dict
rag_config: dict[str, Any] = get_rag_config()

table_name: str = rag_config["knowledge_base"]["table_name"]
emb_config: dict = rag_config["embeddings"]

# LLM Parameters
# -----------------------------
llm_api_config: dict[str, Any] = rag_config["llm"]
LLM_TEMP: float = llm_api_config["settings"]["model_temp"]
LLM_API_NAME: str = llm_api_config["settings"]["api_name"]
LLM_API_CONFIG: dict[str, Any] = llm_api_config["api"][LLM_API_NAME]
LLM_API_KEY_NAME: str = LLM_API_CONFIG["key_name"]
LLM_API_KEY_URL: str = LLM_API_CONFIG["key_url"]


# Secrets
# -----------------------------
load_dotenv(REPO_PATH)
_llm_api_key = os.getenv(LLM_API_KEY_NAME)
if not _llm_api_key:
    # Fail fast: `os.getenv` returns None for an unset variable, which would
    # otherwise only surface later as a less obvious error in the API calls.
    raise RuntimeError(f"Environment variable '{LLM_API_KEY_NAME}' is not set (see {LLM_API_KEY_URL}).")
LLM_API_KEY: str = _llm_api_key

# Paths
# The timestamp identifies this evaluation run and is part of all output file names
ts: str = time.strftime("%Y%m%d_%H%M%S")
print(f"Evaluation Run Timestamp: {ts}")
eva_rag_results: Path = GROUND_TRUTH_PATH / f"eva_rag_results_{ts}.json"
eva_rag_similarity: Path = GROUND_TRUTH_PATH / f"eva_rag_similarity_{ts}.csv"

# retriever config
retriever_config: dict = rag_config["retriever"]
# overrides the configured value for this run
retriever_config["enrich_first"] = True  # `False` for tests before 12/09/2024

# Code

## Load Knowledge Base

In [None]:
# Connect to the LanceDB database and show which tables it contains
db: lancedb.db.DBConnection = lancedb.connect(uri=LANCEDB_URI)
available_tables = db.table_names()
print(f"List of all tables in the LanceDB database: {available_tables}")

In [None]:
# Open the knowledge-base table named in the RAG configuration
k_base: KBaseTable = db.open_table(table_name)
n_rows = k_base.count_rows()
print(f"Number of entries in the table '{table_name}': {n_rows}")

## Embeddings

In [23]:
# Embedding model: encodes texts and measures the similarity between vectors
emb_model_name = emb_config["model_name"]
emb_model = SentenceTransformer(
    emb_model_name,
    device=emb_config["device"],
    similarity_fn_name=emb_config["metric"],
)

## Load Ground Truth

In [None]:
# Keep only the ground-truth entries whose title-to-text-chunk cosine
# similarity (`best_doc_sim`) is greater than 0.8
ground_truth_all = pd.read_csv(GROUND_TRUTH_FILE)
ground_truth = ground_truth_all.loc[ground_truth_all["best_doc_sim"] > 0.8]
print(f"Number of entries in the ground truth: {len(ground_truth)}")

In [None]:
# Ground truth: mean and standard deviation of the text-chunk-to-title similarity
best_doc_sim = ground_truth["best_doc_sim"]
print(f"Average similarity: {best_doc_sim.mean():.2f}+-{best_doc_sim.std():.2f}")

## Connect to LLM

In [26]:
def connect_to_llm(api_key: str, api_name: str, api_config: dict, model_name: str = "") -> tuple[Groq, str]:
    """Create an LLM API client and settle on the model to use.

    Args:
        api_key: Key passed to the API client (and to the model lookup).
        api_name: Name used to look up the client class via
            `get_llm_api_client_object`.
        api_config: API configuration; its optional "models" entry may provide
            "url" and "ranked" for the model lookup.
        model_name: Model to use. If empty, a model is chosen with
            `get_preferred_model`.

    Returns:
        The instantiated API client and the resolved model name.
    """
    # Only look up a preferred model when the caller did not name one
    if not model_name:
        models_cfg: dict = api_config.get("models", {})
        model_name = get_preferred_model(
            api_key=api_key,
            models_url=models_cfg.get("url", ""),
            ranked_models=models_cfg.get("ranked", []),
        )

    # Resolve the client class for the API, then instantiate it with the key
    client_factory = get_llm_api_client_object(api_name=api_name)
    llm_client = client_factory(api_key=api_key)
    return llm_client, model_name


# llm_api_client, model_name = connect_to_llm(api_key=LLM_API_KEY, api_name=LLM_API_NAME, api_config=LLM_API_CONFIG)

In [None]:
# Fetch the list of all models offered by the API
models_url: str = LLM_API_CONFIG.get("models", {}).get("url", "")
models_full_list: list[dict] = get_model_list(api_key=LLM_API_KEY, models_url=models_url)

# Substrings of model ids that are left out of the evaluation
models_excluded: list[str] = [
    "whisper",  # speech-to-text
    "tool",  # for tool usage
    "llava",  # Context Window: 4,096 tokens
    "gemma-",  # older than gemma2
    "guard",  # classifier for safe or not safe text
    "llama-3.1",  # in 'preview' stage
]

# Keep the active models whose id contains none of the excluded substrings
models_selected: list[str] = sorted(
    md["id"] for md in models_full_list if md["active"] and not any(ex in md["id"] for ex in models_excluded)
)
models_selected
# as of 11/09/2024: ['gemma2-9b-it', 'llama3-70b-8192', 'llama3-8b-8192', 'mixtral-8x7b-32768']

In [28]:
# waking up assistant, if needed: look up the client class and instantiate it
llm_client_cls = get_llm_api_client_object(api_name=LLM_API_NAME)
llm_api_client: Groq = llm_client_cls(api_key=LLM_API_KEY)
# model used for the manual test below
model_name = "mixtral-8x7b-32768"  # for test on 09/09/2024

## Evaluation

### Manual Testing

In [29]:
def process_user_input(
    user_prompt: str,
    k_base: KBaseTable,
    client: Groq,
    llm_temp: float,
    model_name: str,
) -> dict[str, Any]:
    """Run one user prompt through the RAG chat pipeline and query the LLM.

    A two-message chat history (welcome message for "John Doe" plus the user
    prompt) is built, `build_full_llm_chat_input` assembles the LLM input from
    it using the module-level `retriever_config`, and a single non-streamed
    completion is requested from `client`.

    Args:
        user_prompt: The user query.
        k_base: Knowledge-base table handed to `build_full_llm_chat_input`.
        client: LLM API client used for the chat completion.
        llm_temp: Sampling temperature passed to the API.
        model_name: Name of the model to query.

    Returns:
        A dict with the keys
        - "context": result of `extract_context_from_msg` applied to the
          content of the first LLM input message,
        - "txt_response": message content of the first response choice,
        - "full_response": the raw response object returned by the client.
    """
    # create chat history
    chat_history: list[dict[str, str]] = [
        {"role": "assistant", "content": WELCOME_MSG.format(user_name="John Doe")},
        {"role": "user", "content": user_prompt},
    ]

    # build LLM chat input
    messages: list[dict[str, str]] = build_full_llm_chat_input(
        user_prompt=user_prompt,
        chat_history=chat_history,
        k_base=k_base,
        retriever_config=retriever_config,
    )
    # NOTE(review): presumably the first message carries the retrieved context — confirm
    context: str = extract_context_from_msg(messages[0]["content"])

    # send message to LLM and get response (stream=False: one complete response object)
    response_raw: Any = client.chat.completions.create(
        messages=messages,
        model=model_name,
        temperature=llm_temp,
        stream=False,
    )
    txt_response: str = response_raw.choices[0].message.content

    return {"context": context, "txt_response": txt_response, "full_response": response_raw}

In [None]:
# testing: turn the title of the first ground-truth entry into a query
entry = ground_truth.iloc[0]
title_raw: str = entry["title"]
# lower-case, replace colons and non-breaking spaces by blanks, trim the ends
user_prompt: str = title_raw.lower().translate(str.maketrans({":": " ", "\u00a0": " "})).strip()
print(user_prompt)

In [31]:
# Keyword arguments shared by the manual calls of `process_user_input`
chat_config: dict = {
    "llm_temp": LLM_TEMP,
    "k_base": k_base,
    "client": llm_api_client,
    "model_name": model_name,
}
# Query the LLM with the test prompt
resp_dict = process_user_input(user_prompt=user_prompt, **chat_config)

In [None]:
# Show the text of the LLM answer for the test prompt
print(resp_dict["txt_response"])

In [None]:
# Look up the expected text chunk in the knowledge base by its hash and take its vector
hash_doc: str = entry["best_doc_hash"]
matching_rows = k_base.search().where(f"hash_doc = '{hash_doc}'").to_pandas()
text_chunk_vec: list[float] = matching_rows.iloc[0]["vector"].tolist()
# Embed the LLM answer
resp_vec = emb_model.encode([resp_dict["txt_response"]])[0]
# Similarity between the expected text chunk and the answer
similarity_matrix = emb_model.similarity([text_chunk_vec], [resp_vec])
similarity_matrix.tolist()[0][0]

### Loop: compute similarity between expected text chunk and generated LLM response

In [None]:
user_name = "John Doe"

pbar = tqdm(ground_truth.iterrows(), total=len(ground_truth))

# Evaluation input per document hash: query text, LLM prompt and expected vector
eva_input: dict[str, dict[str, Any]] = {}
for _, entry in pbar:
    # hash of the best document for the given title
    hash_doc: str = entry["best_doc_hash"]

    record: dict[str, Any] = {}
    eva_input[hash_doc] = record

    # prepare query: lower-cased title without colons and non-breaking spaces
    user_prompt: str = entry["title"].lower().replace(":", " ").replace("\u00a0", " ").strip()
    record["query_text"] = user_prompt

    # chat history: welcome message followed by the query
    chat_history: list[dict[str, str]] = [
        {"role": "assistant", "content": WELCOME_MSG.format(user_name=user_name)},
        {"role": "user", "content": user_prompt},
    ]

    # build LLM chat input
    prompt: list[dict[str, str]] = build_full_llm_chat_input(
        user_prompt=user_prompt,
        chat_history=chat_history,
        k_base=k_base,
        retriever_config=retriever_config,
    )
    record["prompt"] = prompt

    # vector of the expected text chunk, looked up in the knowledge base by hash
    matching_rows = k_base.search().where(f"hash_doc = '{hash_doc}'").to_pandas()
    text_chunk_vec: list[float] = matching_rows.iloc[0]["vector"].tolist()
    record["text_chunk_vec"] = text_chunk_vec
# runtime: < 1 minute

In [None]:
# Read in results already stored for this run (if any), so that the
# evaluation loop can skip documents that were evaluated before.
eva_dict: dict[str, dict[str, Any]] = {}
if eva_rag_results.exists():
    # json.dump writes ASCII-only output by default, so UTF-8 decodes it safely
    with open(eva_rag_results, encoding="utf-8") as f:
        eva_dict = json.load(f)
print(f"Number of evaluation results: {len(eva_dict)}")

In [42]:
def pbar_update(pbar: tqdm, txt: str) -> None:
    """Set `txt` as the description of the progress bar and redraw the bar.

    The text is left-aligned and padded with blanks to a minimum width of
    30 characters; longer texts are passed on unchanged.

    Args:
        pbar: Progress bar to update.
        txt: Status text to display.
    """
    description = f"{txt:<30}"
    pbar.set_description(description)
    pbar.refresh()

In [None]:
pbar = tqdm(eva_input.items(), total=len(eva_input))
for hash_doc, data in pbar:
    # continue  # for testing

    # skip documents that already have results
    if hash_doc in eva_dict:
        continue

    # prepared LLM chat input and vector of the expected text chunk
    prompt: list[dict[str, str]] = data["prompt"]
    text_chunk_vec = data["text_chunk_vec"]

    # collect the responses of all selected models for this document
    llm_rsp: dict[str, dict[str, Any]] = {}
    out: dict = {"input": {"messages": prompt, "text_chunk_vec": text_chunk_vec}, "llm_rsp": llm_rsp}
    for model_name in models_selected:
        model_rsp: dict[str, Any] = {}
        llm_rsp[model_name] = model_rsp

        pbar_update(pbar, f"'{model_name}': Sleeping...")
        time.sleep(0.5)  # avoid rate limit problems

        # send message to LLM and get response
        pbar_update(pbar, f"'{model_name}': Waiting for response ...")
        response_raw = llm_api_client.chat.completions.create(
            messages=prompt,
            model=model_name,
            temperature=LLM_TEMP,
            stream=False,
        )
        txt_response: str = response_raw.choices[0].message.content

        # keep the answer text and the usage statistics reported by the API
        pbar_update(pbar, f"'{model_name}': Saving response ...")
        model_rsp["txt_response"] = txt_response
        model_rsp["usage"] = dict(response_raw.usage)

        pbar_update(pbar, f"'{model_name}': Computing similarity...")
        # create embedding vector of response
        resp_vec = emb_model.encode([txt_response])[0]
        # compute similarity between expected text chunk and response
        similarity = emb_model.similarity([text_chunk_vec], [resp_vec]).tolist()[0][0]
        model_rsp["similarity"] = similarity

    # save results
    eva_dict[hash_doc] = out

#
# < 50 minutes

#### Save results

In [44]:
# save full results to json file
# Serialize first: if a value is not JSON-serializable, the error is raised
# before the file is opened, so no truncated results file is left behind.
eva_json: str = json.dumps(eva_dict, indent=4)
with open(eva_rag_results, "w", encoding="utf-8") as f:
    f.write(eva_json)

In [None]:
# For 09/09/2024 Evaluation results
# # save just the similarity scores
# df = pd.DataFrame().from_dict(eva_dict, orient="index")
# df.index.name = "hash_doc"
# df["similarity"].to_csv(eva_rag_similarity)

In [45]:
# since: 11/09/2024
# Similarity per document (rows) and model (columns)
out = {
    hash_doc: {model_name: data["llm_rsp"][model_name]["similarity"] for model_name in models_selected}
    for hash_doc, data in eva_dict.items()
}
df = pd.DataFrame.from_dict(out, orient="index")
# set index name to hash_doc
df.index.name = "hash_doc"
# save
df.to_csv(eva_rag_similarity, index=True)

# Completion time reported in the usage statistics, same layout as above
out = {
    hash_doc: {
        model_name: data["llm_rsp"][model_name]["usage"]["completion_time"] for model_name in models_selected
    }
    for hash_doc, data in eva_dict.items()
}
df = pd.DataFrame.from_dict(out, orient="index")
df.index.name = "hash_doc"  # set index name to hash_doc
df.to_csv(GROUND_TRUTH_PATH / f"eva_rag_rsp_time_{ts}.csv", index=True)

### Analyze results

#### Multiple models (since 11/09/2024)

In [46]:
# Read the similarity scores saved for this run; the first CSV column (the
# document hash written as index) becomes the index again
sim_answer = pd.read_csv(eva_rag_similarity, index_col=0)

In [None]:
# compute mean and std for each column
stats = pd.DataFrame({"mean": sim_answer.mean(), "std": sim_answer.std()}).sort_values(by="mean", ascending=False)
stats.round(3)

In [None]:
# Histograms of the similarity scores, one per column (20 bins, density-normalised)
sim_answer.hist(bins=20, density=True)

In [None]:
# Baseline: similarity between title and expected text chunk, indexed by document hash
baseline = ground_truth.set_index("best_doc_hash")["best_doc_sim"].rename("Baseline")
# Put the baseline next to the similarity scores of the answers
merge = pd.concat([baseline, sim_answer], axis=1)
merge.head()

In [None]:
# Kernel density estimate of the similarity of every column in `merge`.
# The figure size is passed to the plot call itself: `DataFrame.plot` opens its
# own figure when no `ax` is given, so a preceding `plt.figure(figsize=...)`
# would only leave an empty figure behind and not size the plot.
axis = merge.plot.kde(figsize=(15, 4))
# ground_truth["best_doc_sim"].plot.kde(color="black", ls="--")
axis.set_xlabel("Cosine Similarity")
axis.set_ylabel("Density")
axis.grid(True)
axis.set_title("RAG evaluation: Similarity Distribution")
axis.set_xlim(0.7, 1)
axis.legend()  # list(sim_answer.columns) + ["baseline"])
plt.show()

In [None]:
# Box plot of the similarity per column (outliers hidden)
axis = merge.plot.box(figsize=(7, 3), showfliers=False)
axis.set_title("RAG evaluation: Similarity")
axis.set_ylabel("Cosine Similarity")
# tilt x labels by 45 degrees
# axis.set_xticklabels(axis.get_xticklabels(), rotation=45)
# reduce font size of x labels
axis.tick_params(axis="x", labelsize=8)
axis.grid(True)
# save figure next to the similarity CSV
sim_box_png = eva_rag_similarity.parent / f"{eva_rag_similarity.stem}_sim_box.png"
plt.savefig(sim_box_png, dpi=150)
plt.show()

In [None]:
# Box plot of the response times per model (outliers hidden)
rsp_time_csv = GROUND_TRUTH_PATH / f"eva_rag_rsp_time_{ts}.csv"
time_answer = pd.read_csv(rsp_time_csv, index_col=0)
axis = time_answer.plot.box(figsize=(6, 3), showfliers=False)
axis.set_title("RAG evaluation: Response Time")
axis.set_ylabel("Response Time (s)")
axis.tick_params(axis="x", labelsize=8)
axis.grid(True)
# save figure next to the similarity CSV
time_box_png = eva_rag_similarity.parent / f"{eva_rag_similarity.stem}_time_box.png"
plt.savefig(time_box_png, dpi=150)
plt.show()

#### Single model: mistral (09/09/2024)

In [None]:
# Similarity scores of the single-model run; the first CSV column becomes the index.
# NOTE(review): the path is relative to the current working directory, while the
# other cells build their paths from GROUND_TRUTH_PATH — confirm which is intended.
sim_answer = pd.read_csv("data/ground_truth/eva_rag_similarity.csv", index_col=0)

In [None]:
# Mean and standard deviation of the similarity between response and expected text chunk
answer_sim = sim_answer["similarity"]
print(f"Average similarity: {answer_sim.mean():.2f}+-{answer_sim.std():.2f}")

In [None]:
# Histogram and KDE of the answer similarity, plus the KDE of the ground-truth similarity
plt.figure(figsize=(15, 4))

answer_sim = sim_answer["similarity"]
answer_sim.hist(bins=20, density=True)
answer_sim.plot.kde()
# idea (not implemented): show average and std as transparent vertical band

baseline_sim = ground_truth["best_doc_sim"]
baseline_sim.plot.kde()

plt.title("RAG evaluation: Similarity Distribution")
plt.xlabel("Cosine Similarity")
plt.ylabel("Density")
plt.xlim(0.7, 1)
plt.grid(True)
plt.legend(["Generated answer vs expected answer", "Query vs expected answer"])
plt.show()

In [142]:
# Ground-truth similarity (indexed by document hash) next to the answer similarity
baseline = ground_truth.set_index("best_doc_hash")["best_doc_sim"]
merge = pd.concat([baseline, sim_answer], axis=1)

In [None]:
# Box plot comparing the baseline with the generated answers (outliers hidden)
# more readable x tick labels, one per box
tick_labels = ["Query vs Expected answer\n(Baseline)", "Generated answer\nvs Expected answer"]

# plt.figure(figsize=(8, 4))
axis = merge.plot.box(figsize=(6, 3), showfliers=False)
axis.set_title("RAG evaluation")
axis.set_ylabel("Cosine Similarity")
axis.set_xticks(ticks=[1, 2], labels=tick_labels)
axis.grid(True)
plt.show()