In [1]:
import os
import sys
from tqdm.auto import tqdm
import pandas as pd
import random
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

sys.path.append(os.path.abspath("../"))
import src.data.indexing as data_indexing
from src.utils.generation import call_llm_openrouter
from src.utils.embeddings import get_all_chroma_embeddings, get_embedding_model
from src.utils.text_processing import format_dictionary_pairs
from src.config import PROCESSED_DATA_DIR, DEFAULT_QA_MODEL
from src.templates.prompt_templates import QA_GENERATION_TEMPLATE

  from .autonotebook import tqdm as notebook_tqdm


For the evaluation task, mainly deepeval library will be used. 

In [11]:
N_GENERATIONS=2
SIMILARITY_THRESHOLD = 0.93

In [6]:
outputs = []
used_questions = {}
embedding_model = get_embedding_model()

Loading embedding model: intfloat/multilingual-e5-large-instruct
Embedding model ready to use.


I believe that the best possible case would be manual creation of the answer-question pairs. However, due to the time constraints it is not possible, therefore we will use an LLM, as described in https://huggingface.co/learn/cookbook/en/rag_evaluation

I am aware of deepevals synthesiser, however, I used custom solution for better flexibility and research on how the QA generation works. 

In [12]:
chroma_data = get_all_chroma_embeddings(embedding_model)
doc_embeddings = chroma_data['embeddings']
docs = chroma_data['documents']
doc_ids = chroma_data['ids']
doc_metadatas = chroma_data['metadatas']

# print(f"Docs metadata: {doc_metadatas}")
# print(f"Docs: {docs}")

similarity_matrix = cosine_similarity(doc_embeddings)

doc_groups = []
used_docs = set()

for i in range(len(docs)):
    if i in used_docs:
        continue
        
    current_group = [i]
    used_docs.add(i)
    
    similar_indices = np.where(similarity_matrix[i] > SIMILARITY_THRESHOLD)[0]
    for idx in similar_indices:
        if idx != i and idx not in used_docs:
            current_group.append(idx)
            used_docs.add(idx)
    
    doc_groups.append(current_group)

print(f"Doc groups: {doc_groups}")

Doc groups: [[0, np.int64(6), np.int64(20)], [1], [2, np.int64(30), np.int64(79)], [3, np.int64(26)], [4, np.int64(28)], [5], [7], [8, np.int64(35)], [9], [10, np.int64(41), np.int64(55)], [11, np.int64(16)], [12], [13, np.int64(22), np.int64(71)], [14, np.int64(76)], [15], [17, np.int64(37)], [18], [19], [21], [23], [24], [25, np.int64(68)], [27], [29], [31], [32], [33], [34], [36], [38], [39], [40], [42], [43, np.int64(52), np.int64(56)], [44], [45], [46, np.int64(62)], [47, np.int64(54)], [48, np.int64(61)], [49], [50], [51], [53], [57], [58], [59], [60], [63], [64], [65, np.int64(85)], [66], [67], [69, np.int64(80)], [70], [72], [73], [74], [75], [77], [78], [81], [82], [83, np.int64(89)], [84], [86], [87, np.int64(88)], [90], [91], [92], [93], [94], [95], [96, np.int64(99)], [97], [98]]


In [None]:

for doc in tqdm(random.sample(docs, N_GENERATIONS)):
    group = random.choice(doc_groups)
    n_docs = random.randint(1, 3)    
    
    formatted_qa = format_dictionary_pairs(used_questions)
    QA_PROMTP = QA_GENERATION_TEMPLATE.format(context=doc.page_content, qa_pairs=formatted_qa)
    output_QA_couple = call_llm_openrouter(QA_PROMTP, DEFAULT_QA_MODEL, {"question": "question", "answer": "answer"})
    used_questions[output_QA_couple["question"]] = output_QA_couple["answer"]
    outputs.append({
        "query": output_QA_couple["question"],
        "expected_output": output_QA_couple["answer"],
        "actual_output": "",  
        "context": doc.page_content,  
        "retrieval_context": ""
    })
    print(f"Question: {output_QA_couple['question']}")
    print(f"Answer: {output_QA_couple['answer']}")
    
qa_df = pd.DataFrame(outputs)
csv_path = f"../data/evaluation/qa_pairs.csv"
os.makedirs(os.path.dirname(csv_path), exist_ok=True)

qa_df.to_csv(csv_path, index=False)
print(f"Saved {len(outputs)} QA pairs to {csv_path}")

  0%|          | 0/2 [00:00<?, ?it/s]

['question', 'answer']
{'model': 'google/gemini-2.0-flash-001', 'messages': [{'role': 'user', 'content': 'Your task is to generate a question and its corresponding answer based on the provided recipe context. \n\n# Instructions:\n- Use czech language for both questions and answers.\n\n## Question Guidelines\n- Create questions that reflect realistic user scenarios:\n  - COOKING PHASE: Questions about techniques, ingredients, or clarification of steps. Be creative about what kind of questions a user might ask while cooking.\n  - SELECTION PHASE: Questions about dietary fit, ingredient requirements, general dish selection, cuisine characteristics, what dishes can be made with certain ingredients, or recipes that fit user preferences.\n- The question must be answerable given the information from the context. DO NOT create questions that require external knowledge.\n- Vary in complexity (some factual, some requiring inference). \n- Use natural, conversational language. \n- Avoid mentioning

 50%|█████     | 1/2 [00:01<00:01,  1.46s/it]

Question: Čím ještě se dají dochutit hotová grilovaná kuřecí srdíčka?
Answer: Kromě olivového oleje a citronové šťávy se k hotovým grilovaným srdíčkům hodí i lžíce nasekaného čerstvého koriandru.
['question', 'answer']
{'model': 'google/gemini-2.0-flash-001', 'messages': [{'role': 'user', 'content': 'Your task is to generate a question and its corresponding answer based on the provided recipe context. \n\n# Instructions:\n- Use czech language for both questions and answers.\n\n## Question Guidelines\n- Create questions that reflect realistic user scenarios:\n  - COOKING PHASE: Questions about techniques, ingredients, or clarification of steps. Be creative about what kind of questions a user might ask while cooking.\n  - SELECTION PHASE: Questions about dietary fit, ingredient requirements, general dish selection, cuisine characteristics, what dishes can be made with certain ingredients, or recipes that fit user preferences.\n- The question must be answerable given the information from 

100%|██████████| 2/2 [00:02<00:00,  1.30s/it]

Question: S čím se guacamole obvykle podává?
Answer: Guacamole se podává s nachos.
Saved 2 QA pairs to ../data/evaluation/qa_pairs.csv



