# Evaluation of chatbot answers


## Libraries

In [70]:
import json
import os
import time
import warnings
from collections.abc import Iterable
from pathlib import Path
from typing import Any

import lancedb
import pandas as pd
from dotenv import load_dotenv
from groq import Groq
from lancedb.table import Table as KBaseTable
from matplotlib import pyplot as plt
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

from src.constants import (
    GROUND_TRUTH_FILE,
    GROUND_TRUTH_PATH,
    LANCEDB_URI,
    REPO_PATH,
    get_rag_config,
)
from src.llm_api import (
    build_full_llm_chat_input,
    get_llm_api_client_object,
    get_preferred_model,
)
from src.prompt_building import WELCOME_MSG, extract_context_from_msg

# Silence FutureWarning and UserWarning messages for the rest of the session.
for _ignored_category in (FutureWarning, UserWarning):
    warnings.simplefilter(action="ignore", category=_ignored_category)

## Parameters

In [136]:
# RAG configuration
# -----------------------------
# Load the configuration once and read every section from the same object
# instead of calling get_rag_config() separately for each section.
rag_config: dict[str, Any] = get_rag_config()
table_name: str = rag_config["knowledge_base"]["table_name"]
emb_config: dict = rag_config["embeddings"]

# LLM Parameters
# -----------------------------
llm_api_config: dict[str, Any] = rag_config["llm"]
LLM_TEMP: float = llm_api_config["settings"]["model_temp"]
LLM_API_NAME: str = llm_api_config["settings"]["api_name"]
# Settings of the API selected by LLM_API_NAME.
LLM_API_CONFIG: dict[str, Any] = llm_api_config["api"][LLM_API_NAME]
LLM_API_KEY_NAME: str = LLM_API_CONFIG["key_name"]
LLM_API_KEY_URL: str = LLM_API_CONFIG["key_url"]


# Secrets
# -----------------------------
load_dotenv(REPO_PATH)
# os.getenv returns None when the variable is not set, hence the optional type.
LLM_API_KEY: str | None = os.getenv(LLM_API_KEY_NAME)

# Paths
# -----------------------------
# Evaluation result files, both located in GROUND_TRUTH_PATH.
eva_rag_results: Path = GROUND_TRUTH_PATH / "eva_rag_results.json"
eva_rag_similarity: Path = GROUND_TRUTH_PATH / "eva_rag_similarity.csv"

# Code

## Load Knowledge Base

In [None]:
# Connect to the LanceDB database and show which tables it contains.
db: lancedb.db.DBConnection = lancedb.connect(uri=LANCEDB_URI)
available_tables = db.table_names()
print(f"List of all tables in the LanceDB database: {available_tables}")

In [None]:
# Open the knowledge-base table and report how many rows it holds.
k_base: lancedb.table.Table = db.open_table(table_name)
n_entries = k_base.count_rows()
print(f"Number of entries in the table '{table_name}': {n_entries}")

## Embeddings

In [7]:
# Embedding model; it is used below to embed LLM answers and to compute
# similarities with the metric named in the embeddings configuration.
emb_model_name = emb_config["model_name"]
emb_device = emb_config["device"]
emb_metric = emb_config["metric"]
emb_model = SentenceTransformer(emb_model_name, device=emb_device, similarity_fn_name=emb_metric)

## Load Ground Truth

In [None]:
# Keep only the ground-truth entries whose "best_doc_sim" score is strictly above 0.8.
all_ground_truth = pd.read_csv(GROUND_TRUTH_FILE)
ground_truth = all_ground_truth[all_ground_truth["best_doc_sim"] > 0.8]
print(f"Number of entries in the ground truth: {len(ground_truth)}")

In [None]:
# Ground truth: mean and standard deviation of the "best_doc_sim" scores.
gt_sim = ground_truth["best_doc_sim"]
print(f"Average similarity: {gt_sim.mean():.2f}+-{gt_sim.std():.2f}")

## Connect to LLM

In [9]:
def connect_to_llm(api_key: str, api_name: str, api_config: dict, model_name: str = "") -> tuple[Groq, str]:
    """Create an LLM API client and determine which model to use.

    Args:
        api_key: Key passed to ``get_preferred_model`` and to the client constructor.
        api_name: Name handed to ``get_llm_api_client_object`` to obtain the client class.
        api_config: API settings; its optional ``"models"`` entry may provide
            ``"url"`` and ``"ranked"`` values used for model selection.
        model_name: Model to use. When empty, the model is chosen by
            ``get_preferred_model``.

    Returns:
        The instantiated client and the model name.
    """
    if not model_name:
        # No model requested: let get_preferred_model pick one from the configured models.
        models_cfg = api_config.get("models", {})
        model_name = get_preferred_model(
            api_key=api_key,
            models_url=models_cfg.get("url", ""),
            ranked_models=models_cfg.get("ranked", []),
        )

    client_factory = get_llm_api_client_object(api_name=api_name)
    return client_factory(api_key=api_key), model_name

In [10]:
# Create the LLM client and determine the model name via connect_to_llm.
llm_api_client: Groq
llm_api_client, model_name = connect_to_llm(
    api_name=LLM_API_NAME,
    api_config=LLM_API_CONFIG,
    api_key=LLM_API_KEY,
)

In [None]:
# Display the model name returned by connect_to_llm (bare expression, shown as cell output).
model_name

In [12]:
# Keyword arguments shared by the process_user_input calls in this notebook.
chat_config: dict = {
    "llm_temp": LLM_TEMP,
    "k_base": k_base,
    "client": llm_api_client,
    "model_name": model_name,
}

## Process User Input

In [13]:
def process_user_input(
    user_prompt: str,
    k_base: KBaseTable,
    client: Groq,
    llm_temp: float,
    model_name: str,
) -> dict[str, Any]:
    """Send a single user prompt to the LLM and collect its answer.

    Args:
        user_prompt: Question to ask the assistant.
        k_base: Knowledge-base table passed to ``build_full_llm_chat_input``.
        client: LLM API client used for the chat completion request.
        llm_temp: Temperature forwarded to the completion request.
        model_name: Model identifier forwarded to the completion request.

    Returns:
        A dict with the keys ``"context"`` (extracted from the content of the
        first chat message), ``"txt_response"`` (message content of the first
        completion choice) and ``"full_response"`` (the unmodified completion
        object).
    """
    # create chat history: welcome message followed by the user's prompt
    chat_history: list[dict[str, str]] = [
        {"role": "assistant", "content": WELCOME_MSG.format(user_name="John Doe")},
        {"role": "user", "content": user_prompt},
    ]

    # build LLM chat input
    messages: list[dict[str, str]] = build_full_llm_chat_input(user_prompt, chat_history, k_base)
    context: str = extract_context_from_msg(messages[0]["content"])

    # send message to LLM and get response (non-streaming, so the full completion is returned)
    response_raw: Any = client.chat.completions.create(
        messages=messages,
        model=model_name,
        temperature=llm_temp,
        stream=False,
    )
    txt_response: str = response_raw.choices[0].message.content

    return {"context": context, "txt_response": txt_response, "full_response": response_raw}

In [None]:
# Smoke test: build the query text from the title of the first ground-truth entry.
entry = ground_truth.iloc[0]
# Lower-case the title and turn colons and non-breaking spaces into plain spaces.
_to_space = str.maketrans({":": " ", "\u00a0": " "})
query_text: str = entry["title"].lower().translate(_to_space).strip()
print(query_text)

In [24]:
# Send the test query to the LLM using the shared chat configuration.
resp_dict = process_user_input(user_prompt=query_text, **chat_config)

In [None]:
# Show the answer text returned for the test query.
print(resp_dict["txt_response"])

In [None]:
# Look up the stored vector of the expected text chunk by its document hash.
hash_doc: str = entry["best_doc_hash"]
matching_rows = k_base.search().where(f"hash_doc = '{hash_doc}'").to_pandas()
text_chunk_vec: list[float] = matching_rows.iloc[0]["vector"].tolist()
# Embed the LLM answer with emb_model.
answer_embeddings = emb_model.encode([resp_dict["txt_response"]])
resp_vec: list[float] = answer_embeddings[0]
# Similarity between the expected chunk and the answer; the bare expression is the cell output.
similarity_matrix = emb_model.similarity([text_chunk_vec], [resp_vec])
similarity_matrix.tolist()[0][0]

### Compute similarity between expected text chunk and created response

In [137]:
# Load the stored evaluation results from the JSON file at eva_rag_results.
eva_dict = json.loads(eva_rag_results.read_text())

In [34]:
# eva_dict: dict[str, dict[str, str | float]] = {}

In [None]:
# The loop sends one LLM request per ground-truth entry that is not yet in eva_dict.
# It is switched off by default; set RERUN_EVALUATION to True to compute missing entries.
RERUN_EVALUATION: bool = False

pbar = tqdm(ground_truth.iterrows(), total=len(ground_truth))
for _, entry in pbar:
    if not RERUN_EVALUATION:
        continue

    # get hash of best doc for given title; entries already evaluated are skipped
    hash_doc: str = entry["best_doc_hash"]
    if hash_doc in eva_dict:
        continue

    pbar.set_description(f"{'sleeping...':<40}")
    pbar.refresh()
    time.sleep(0.5)  # avoid rate limit problems

    out: dict[str, str | float] = {}

    # prepare query
    title: str = entry["title"]
    query_text: str = title.lower().replace(":", " ").replace("\u00a0", " ").strip()
    out["query_text"] = query_text

    # ask LLM to process query
    # (single quotes inside the f-string keep it valid on Python versions before 3.12)
    pbar.set_description(f"{'Processing query...':<40}")
    pbar.refresh()
    resp_dict = process_user_input(user_prompt=query_text, **chat_config)
    txt_response: str = resp_dict["txt_response"]
    out["txt_response"] = txt_response
    out["context"] = resp_dict["context"]

    pbar.set_description(f"{'Computing similarity...':<40}")
    pbar.refresh()
    # get text chunk vector from knowledge base
    text_chunk_vec: list[float] = (
        k_base.search().where(f"hash_doc = '{hash_doc}'").to_pandas().iloc[0]["vector"].tolist()
    )
    # create embedding vector of response
    resp_vec: list[float] = emb_model.encode([txt_response])[0]
    # compute similarity
    out["similarity"] = emb_model.similarity([text_chunk_vec], [resp_vec]).tolist()[0][0]

    # save results (in memory only; keyed by the document hash)
    eva_dict[hash_doc] = out

#  77/77 [45:17<00:00, 35.29s/it]

In [72]:
# # save results to json file
# with open(eva_rag_results, "w") as f:
#     json.dump(eva_dict, f, indent=4)

# # save just the similarity scores
# df = pd.DataFrame().from_dict(eva_dict, orient="index")
# df.index.name = "hash_doc"
# df["similarity"].to_csv(eva_rag_similarity)

#### Analyze results

In [139]:
# Load the similarity scores from the CSV file; the first column becomes the index.
sim_answer = pd.read_csv(eva_rag_similarity, index_col=0)

In [None]:
# Mean and standard deviation of the similarity between response and expected text chunk.
answer_sim = sim_answer["similarity"]
print(f"Average similarity: {answer_sim.mean():.2f}+-{answer_sim.std():.2f}")

In [None]:
# Density histogram and KDE of the answer similarities, plus the KDE of the
# ground-truth "best_doc_sim" scores for comparison, all on one axes.
fig, ax = plt.subplots(figsize=(15, 4))
answer_sim_scores = sim_answer["similarity"]
answer_sim_scores.hist(bins=20, density=True, ax=ax)
answer_sim_scores.plot.kde(ax=ax)
# TODO: show average and std as a transparent vertical band (not implemented yet)

ground_truth["best_doc_sim"].plot.kde(ax=ax)

ax.set_xlabel("Cosine Similarity")
ax.set_ylabel("Density")
ax.grid(True)
ax.set_title("RAG evaluation: Similarity Distribution")
ax.set_xlim(0.7, 1)
# NOTE(review): the labels are given positionally; presumably they are matched to the
# two KDE curves in plotting order - confirm against the rendered figure.
ax.legend(["Generated answer vs expected answer", "Query vs expected answer"])
plt.show()

In [142]:
# Place the ground-truth "best_doc_sim" scores (indexed by "best_doc_hash") next to the
# answer similarities; pd.concat with axis=1 aligns the two on their indexes.
baseline_sim = ground_truth.set_index("best_doc_hash")["best_doc_sim"]
merge = pd.concat([baseline_sim, sim_answer], axis=1)

In [None]:
# Box plot of the columns of `merge`, drawn without outlier markers.
tick_labels = ["Query vs Expected answer\n(Baseline)", "Generated answer\nvs Expected answer"]
axis = merge.plot.box(figsize=(6, 3), showfliers=False)
axis.set(ylabel="Cosine Similarity", title="RAG evaluation")
# Replace the default x tick labels with more readable ones.
axis.set_xticks(ticks=[1, 2], labels=tick_labels)
axis.grid(True)
plt.show()