# Chroma Database Generation

# Library

In [1]:
import langchain_community
import langchain_text_splitters
from langchain_community.document_loaders import PyPDFLoader, pdf
from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings,
)
from langchain_community.vectorstores import Chroma
from langchain_text_splitters import CharacterTextSplitter, RecursiveCharacterTextSplitter
import os
import pprint
import re
from langchain_core.documents import Document
from langchain_core.prompts.prompt import PromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from langchain_ollama import ChatOllama
from pydantic import BaseModel, Field
import json
import uuid
import chromadb
from chromadb.config import Settings
import unicodedata
from langchain_google_genai import GoogleGenerativeAI
import uuid
# from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction
import pickle as pkl
import requests
import subprocess
import pandas as pd
from bs4 import BeautifulSoup
import tqdm
import time
from sentence_transformers import SentenceTransformer
from langchain_huggingface import HuggingFaceEmbeddings
import datetime
import time
import random
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
with open('api_google.txt') as f:
    
    api_key = json.load(f)

In [3]:
if not os.environ.get("GOOGLE_API_KEY"):
  os.environ["GOOGLE_API_KEY"] = api_key['key']

In [4]:
from langchain.chat_models import init_chat_model
llm = init_chat_model("gemini-2.5-flash", model_provider="google_genai",temperature=0,max_output_tokens=1024) # gemma-3-27b-it

## Load data

In [5]:
with open("info_articles_main.pkl","rb") as f:
    info_articles_main = pkl.load(f)
with open("info_articles_ref_final.pkl","rb") as f:
    info_articles_ref = pkl.load(f)

## Database

### Split the text

In [10]:
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
    separators=["\n\n", "\n", ".", "!", "?", " "]  # smart splitting
)

In [11]:
info_articles_final = info_articles_main + info_articles_ref
len(info_articles_final)

265

In [12]:
info_splitted = []

for j in info_articles_final:

    for key, value in j.items():
    
        if key in ['Abstract', 'Introduction', 'Methods', 'Results', 'Discussion', 'Conclusion',] and value:

            if len(value) > 1200:
                chunks = splitter.split_text(value)

                for i, c in enumerate(chunks):

                    info_splitted.append(
                        {
                            "chunk_index":i,
                            "content": j.get('Authors').split(",")[0]+" et al.,"+j.get('Publication',"Not identified")+", DOI:"+j.get("DOI")+":\n "+c,
                            "parent":key,
                            "split":True,
                            "DOI":j.get("DOI"),
                            "Reference": j.get('Authors').split(",")[0]+" et al.,"+j.get('Publication',"Not identified")
                        }
                    )
            else:

                info_splitted.append(
                        {
                            "chunk_index":0,
                            "content":j.get('Authors').split(",")[0]+" et al.,"+j.get('Publication',"Not identified")+", DOI:"+j.get("DOI")+":\n "+value,
                            "parent":key,
                            "split":False,
                            "DOI":j.get("DOI"),
                            "Reference": j.get('Authors').split(",")[0]+" et al.,"+j.get('Publication',"Not identified")
                        }
                    )

## Embedding

We chose this embedding according to leaderboard of HuggingFace

In [6]:
embedding_function2 = HuggingFaceEmbeddings(model_name="avsolatorio/GIST-small-Embedding-v0", model_kwargs={'device': 'cuda'}) # model_kwargs={'device': 'cuda'}

## Chroma

In [9]:
# 2. Prepare documents, metadata, and IDs
texts = [chunk["content"] for chunk in info_splitted]
metadatas = [{"parent": chunk["parent"], "chunk_index": chunk["chunk_index"],"DOI": chunk["DOI"], "Reference": chunk["Reference"]} for chunk in info_splitted]
ids = [str(uuid.uuid1()) for _ in metadatas]

NameError: name 'info_splitted' is not defined

In [13]:
db = Chroma.from_texts(
    texts=texts,
    embedding=embedding_function2,
    metadatas=metadatas,
    ids=ids,
    collection_name="ReproRAG",
    persist_directory="./chromaRepro"
)

We check that the search works properly

In [54]:
db.similarity_search("Is there a signature to predict endometrial disruption?", 2)

[Document(metadata={'chunk_index': 10, 'Reference': 'P. Diaz-Gimeno et al.,2022', 'DOI': 'https://doi.org/10.1093/humrep/deab262', 'parent': 'Results'}, page_content='P. Diaz-Gimeno et al.,2022, DOI:https://doi.org/10.1093/humrep/deab262\nPredictive performance comparison of signatures and consistency across endometrial datasets'),
 Document(metadata={'DOI': 'https://doi.org/10.1016/j.fertnstert.2024.03.015', 'parent': 'Abstract', 'Reference': 'Patricia Diaz-Gimeno et al.,2024', 'chunk_index': 0}, page_content='Patricia Diaz-Gimeno et al.,2024, DOI:https://doi.org/10.1016/j.fertnstert.2024.03.015\nObjective: To propose a new gene expression signature that identifies endometrial disruptions independent of endometrial luteal phase timing and predicts if patients are at risk of endometrial failure. Design: Multicentric, prospective study. Setting: Reproductive medicine research department in a public hospital affiliated with private fertility clinics and a reproductive genetics laboratory

## Evaluation

### Agents for questions

In [7]:
def call_llm(llm, prompt):
    response = llm.invoke(prompt)
    return response.content

In [8]:
QA_generation_prompt = """
You are given a piece of scientific text (context).
Your task is to generate ONE question and ONE answer from it.

Guidelines for the question:
- It must be factual and answerable using the context only.
- Phrase it naturally, as if a researcher typed it into a search engine.
- Do NOT mention "context", "passage", or "according to the text".
- The question should be specific and concise.

Guidelines for the answer:
- The answer must be a short, factual statement directly supported by the context.
- Do not add explanations, speculation, or references to the text.

Formatting rules (strict):
Output:::
Question: <your question here>
Answer: <your answer here>

Now here is the context:

{context}

Output:::
"""

In [13]:
info_splitted_evaluation = [d for d in info_splitted if d['parent'] in ['Abstract','Introduction','Results','Conclusion','Discussion','Methods']]

In [97]:
mynumb=10

print(info_splitted_evaluation[mynumb])

for d in info_splitted_evaluation:
    
    if d['Reference'] == info_splitted_evaluation[mynumb]['Reference'] and d['parent'] ==  info_splitted_evaluation[mynumb]['parent']:

        print(d)

        if d['chunk_index'] == 0:
            info = d['content']
        else:
            info+=d['content'].split(":\n ")[-1]

print(info)

{'chunk_index': 1, 'content': 'Almudena Devesa-Peiro et al.,2020, DOI:https://doi.org/10.1016/j.fertnstert.2020.01.025:\n . The search identified experiments involving human endometrial transcriptomic case versus control raw data related to uterine pathologies and implantation alterations. The keywords employed in the search included endometriosis, endometrial adenocarcinoma (ADC), recurrent implantation failure (RIF), and recurrent pregnancy loss (RPL), among others (Supplemental Table 1A, available online, for a full list of search terms). No restrictions were placed on publication date or language. Uterine leiomyoma, adenomyosis, and uterine leiomyosarcoma data were not included due to a lack of suitable studies meeting our criteria. For each sample cohort belonging to the same individual study, 39 variables were evaluated (see Supplemental Table 1B), including clinical characteristics of the participants (e.g., age and body mass index), experimental design (e.g., endometrial biopsy

In [14]:
def get_context(piece_of_paper, all_papers):

    for d in all_papers:
    
        if d['Reference'] == piece_of_paper['Reference'] and d['parent'] ==  piece_of_paper['parent']:

            if d['chunk_index'] == 0:
                info = d['content']
            else:
                info+=d['content'].split(":\n ")[-1]

    return(info)


In [15]:
N = 600
examples = []
for sample in tqdm(random.sample(info_splitted_evaluation,N), total=N):
    context = get_context(piece_of_paper=sample, all_papers=info_splitted_evaluation)
    response = call_llm(llm=llm,prompt=QA_generation_prompt.format(context=context))
    
    try:
        question = response.split("Question:")[-1].split("Answer: ")[0].strip()
        answer = response.split("Answer: ")[-1].strip()
        examples.append({
            "context" : context,
            "question" : question,
            "answer" : answer
        })
    except Exception as e:
        continue

with open("examples_evaluation_gemini_2-5.pkl","wb") as f:
    pkl.dump(examples, f)

100%|██████████| 600/600 [48:13<00:00,  4.82s/it]


## Evaluation of questions generated

In [16]:
question_groundedness_critique_prompt = """
You will be given a context and a question.
Your task is to provide a 'total rating' scoring how well one can answer the given question unambiguously with the given context.
Give your answer on a scale of 1 to 5, where 1 means that the question is not answerable at all given the context, and 5 means that the question is clearly and unambiguously answerable with the context.

Provide your answer as follows:

Answer:::
Evaluation: (your rationale for the rating, as a text)
Total rating: (your rating, as a number between 1 and 5)

You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer.

Now here are the question and context.

Question: {question}\n
Context: {context}\n
Answer::: """

question_relevance_critique_prompt = """
You will be given a question.
Your task is to provide a 'total rating' representing how useful this question can be to researchers in the reproductive medicine field.
Give your answer on a scale of 1 to 5, where 1 means that the question is not useful at all, and 5 means that the question is extremely useful.

Provide your answer as follows:

Answer:::
Evaluation: (your rationale for the rating, as a text)
Total rating: (your rating, as a number between 1 and 5)

You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer.

Now here is the question.

Question: {question}\n
Answer::: """

question_standalone_critique_prompt = """
You will be given a question.
Your task is to provide a 'total rating' representing how context-independent this question is.
Give your answer on a scale of 1 to 5, where 1 means that the question depends on additional information to be understood, and 5 means that the question makes sense by itself.
For instance, if the question refers to a particular setting, like 'in the context' or 'in the document', the rating must be 1.
The questions can contain obscure technical nouns or acronyms like Gradio, Hub, Hugging Face or Space and still be a 5: it must simply be clear to an operator with access to documentation what the question is about.

For instance, "What is the name of the checkpoint from which the ViT model is imported?" should receive a 1, since there is an implicit mention of a context, thus the question is not independent from the context.

Provide your answer as follows:

Answer:::
Evaluation: (your rationale for the rating, as a text)
Total rating: (your rating, as a number between 1 and 5)

You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer.

Now here is the question.

Question: {question}\n
Answer::: """

In [17]:
print("Generating critique for each QA couple...")
for output in tqdm(examples,total=len(examples)):

    evaluations = {
        "groundedness": call_llm(
            llm,
            question_groundedness_critique_prompt.format(context=output["context"], question=output["question"]),
        ),
        "relevance": call_llm(
            llm,
            question_relevance_critique_prompt.format(question=output["question"]),
        ),
        "standalone": call_llm(
            llm,
            question_standalone_critique_prompt.format(question=output["question"]),
        ),
    }
    try:
        for criterion, evaluation in evaluations.items():
            score, eval = (
                int(evaluation.split("Total rating: ")[-1].strip()),
                evaluation.split("Total rating: ")[-2].split("Evaluation: ")[1],
            )
            output.update(
                {
                    f"{criterion}_score": score,
                    f"{criterion}_eval": eval,
                }
            )
    except:
        continue

Generating critique for each QA couple...


100%|██████████| 600/600 [56:53<00:00,  5.69s/it]  


In [20]:
generated_questions = pd.DataFrame.from_dict(examples)
generated_questions.loc[:,["question","context","answer","groundedness_score","relevance_score","standalone_score"]]
with open("generated_questions_gemini_2-5.pkl","wb") as f:
    pkl.dump(generated_questions, f)

In [19]:
generated_questions_final = generated_questions.loc[
    (generated_questions["groundedness_score"] >= 4)
    & (generated_questions["relevance_score"] >= 4)
    & (generated_questions["standalone_score"] >= 4)
]
with open("generated_questions_final_gemini_2-5.pkl","wb") as f:
    pkl.dump(generated_questions_final, f)

In [12]:
with open("generated_questions_final.pkl","rb") as f:
    generated_questions_final = pkl.load(f)

In [21]:
generated_questions_final.loc[:,["question","answer","groundedness_score","relevance_score","standalone_score"]]

Unnamed: 0,question,answer,groundedness_score,relevance_score,standalone_score
6,How does the FGB rs1800790A allele affect fibr...,"In F13A 34Val/Val wildtypes, carriage of the F...",5.0,5.0,5.0
20,For which patient group might the ERA test be ...,The ERA test may be helpful for women with sus...,5.0,5.0,5.0
25,What kind of values does the Color Pathway too...,The Color Pathway tool accepts numerical values.,5.0,4.0,5.0
32,What is the implantation potential of an euplo...,"Once an euploid blastocyst is identified, its ...",5.0,5.0,5.0
38,What does the PRISMA 2020 statement reflect?,The PRISMA 2020 statement reflects advances in...,5.0,5.0,5.0
...,...,...,...,...,...
566,Which genes share genetic susceptibility for A...,"The ESR1, HK3, and BRSK1 genes share genetic s...",5.0,5.0,5.0
573,What database were the GSE26787 and GSE63901 d...,The Gene Expression Omnibus (GEO) database.,5.0,4.0,5.0
576,What percentage of women globally are affected...,3.7% of women globally.,5.0,5.0,5.0
589,What is the purpose of unique molecular identi...,Unique molecular identifiers are applied to ov...,5.0,5.0,5.0


In [22]:
eval_dataset = generated_questions_final.to_dict("records")

### Create a function to check RAG performance

In [23]:
llm_reader = init_chat_model("gemini-2.0-flash", model_provider="google_genai",temperature=0.5,max_output_tokens=1024) # gemma-3-27b-it

In [24]:
def load_embeddings(
    documents,
    chunk_size: int,
    embedding_model
):
    splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_size//10,
    separators=["\n\n", "\n", ".", "!", "?", " "]  # smart splitting
    )

    info_splitted = []

    for j in documents:

        for key, value in j.items():
        
            if key in ['Abstract', 'Introduction', 'Methods', 'Results', 'Discussion', 'Conclusion',] and value:

                if len(value) > 1200:
                    chunks = splitter.split_text(value)

                    for i, c in enumerate(chunks):

                        info_splitted.append(
                            {
                                "chunk_index":i,
                                "content": j.get('Authors').split(",")[0]+" et al.,"+j.get('Publication',"Not identified")+", DOI:"+j.get("DOI")+"\n"+c,
                                "parent":key,
                                "split":True,
                                "DOI":j.get("DOI"),
                                "Reference": j.get('Authors').split(",")[0]+" et al.,"+j.get('Publication',"Not identified")
                            }
                        )
                else:

                    info_splitted.append(
                            {
                                "chunk_index":0,
                                "content":j.get('Authors').split(",")[0]+" et al.,"+j.get('Publication',"Not identified")+", DOI:"+j.get("DOI")+"\n"+value,
                                "parent":key,
                                "split":False,
                                "DOI":j.get("DOI"),
                                "Reference": j.get('Authors').split(",")[0]+" et al.,"+j.get('Publication',"Not identified")
                            }
                        )
    
    texts = [chunk["content"] for chunk in info_splitted]
    metadatas = [{"parent": chunk["parent"], "chunk_index": chunk["chunk_index"],"DOI": chunk["DOI"], "Reference": chunk["Reference"]} for chunk in info_splitted]
    ids = [str(uuid.uuid1()) for _ in metadatas]

    db = Chroma.from_texts(
    texts=texts,
    embedding=embedding_model,
    metadatas=metadatas,
    ids=ids,
)

    return(db)

In [25]:
RAG_PROMPT_TEMPLATE = """
<|system|>
Using the information contained in the context,
give a comprehensive answer to the question.
Respond only to the question asked, response should be concise and relevant to the question.
Provide the number of the source document when relevant.
If the answer cannot be deduced from the context, do not give an answer.</s>
<|user|>
Context:
{context}
---
Now here is the question you need to answer.

Question: {question}
</s>
<|assistant|>
"""

In [26]:
def clean_doi_links(text):
    """
    Replace problematic Unicode dashes (like non-breaking hyphen) with normal ASCII dashes.
    """
    return re.sub(r"[\u2010-\u2015\u2212]", "-", text)

def retrieve_context(question, k, database):
    results = database.similarity_search(question, k)
    selected_index = []
    ideal_chunks = []
    meta_selected = []

    def is_new_chunk(r, selected_index):
        next_chunk = "_".join([r["parent"], r["Reference"], str(r["chunk_index"] + 1)])
        prev_chunk = "_".join([r["parent"], r["Reference"], str(r["chunk_index"] - 1)])
        return next_chunk not in selected_index and prev_chunk not in selected_index

    for doc in results:
        r = doc.metadata

        if r["parent"] not in ["Journal", "DOI"] and is_new_chunk(r, selected_index):
            ii = "_".join([r["parent"], r["Reference"], str(r["chunk_index"])])
            selected_index.append(ii)

            candidates = db.get(
                where={"$and": [{"Reference": r["Reference"]}, {"parent": r["parent"]}]}
            )

            max_index = len(candidates["metadatas"]) - 1

            meta_selected.append(candidates["metadatas"])
            ideal_chunks.append(
                [
                    doc
                    for doc, meta in zip(
                        candidates["documents"], candidates["metadatas"]
                    )
                    if meta["chunk_index"]
                    in [
                        r["chunk_index"],
                        max(r["chunk_index"] - 1, 0),
                        min(r["chunk_index"] + 1, max_index),
                    ]
                ]
                )

        context = []
        for text, meta in zip(ideal_chunks, meta_selected):
            if meta:  # Only proceed if meta is not empty
                doi = (
                    clean_doi_links(meta[0]["DOI"])
                    if "DOI" in meta[0]
                    else "DOI not available"
                )
                context.append(
                    f"Summary:\n\n{''.join(text)}\n\n"
                )

    return(context)



In [27]:
def answer_with_rag(
    question: str,
    llm,
    database,
    num_docs_final: int = 7,
    recursive_chunk = False
):
    """Answer a question using RAG with the given knowledge index."""
    # Gather documents with retriever
    
    if  recursive_chunk:
        relevant_docs = retrieve_context(question=question, database=database,k=num_docs_final)
    else:
        relevant_docs = database.similarity_search(query=question, k=num_docs_final)
        relevant_docs = [doc.page_content for doc in relevant_docs]  # keep only the text


    # Build the final prompt
    context = "\nExtracted documents:\n"
    context += "".join([f"Document {str(i)}:::\n" + doc for i, doc in enumerate(relevant_docs)])

    final_prompt = RAG_PROMPT_TEMPLATE.format(question=question, context=context)

    # Redact an answer
    answer = llm.invoke(final_prompt)

    return answer.content, relevant_docs

In [28]:
def run_rag_tests(
    eval_dataset,
    llm,
    database,
    output_file,
    verbose=False,
    test_settings = None,  # To document the test settings used
):
    """Runs RAG tests on the given dataset and saves the results to the given output file."""
    try:  # load previous generations if they exist
        with open(output_file, "r") as f:
            outputs = json.load(f)
    except:
        outputs = []

    for example in tqdm(eval_dataset):
        question = example["question"]
        if question in [output["question"] for output in outputs]:
            continue

        answer, relevant_docs = answer_with_rag(question, llm, database,recursive_chunk=True)
        if verbose:
            print("=======================================================")
            print(f"Question: {question}")
            print(f"Answer: {answer}")
            print(f'True answer: {example["answer"]}')
        result = {
            "question": question,
            "true_answer": example["answer"],
            "generated_answer": answer,
            "retrieved_docs": [doc for doc in relevant_docs],
        }
        if test_settings:
            result["test_settings"] = test_settings
        outputs.append(result)

        with open(output_file, "w") as f:
            json.dump(outputs, f)

In [29]:
from langchain.prompts.chat import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
)
from langchain.schema import SystemMessage

EVALUATION_PROMPT = """###Task Description:
An instruction (might include an Input inside it), a response to evaluate, a reference answer that gets a score of 5, and a score rubric representing a evaluation criteria are given.
1. Write a detailed feedback that assess the quality of the response strictly based on the given score rubric, not evaluating in general.
2. After writing a feedback, write a score that is an integer between 1 and 5. You should refer to the score rubric.
3. The output format should look as follows: \"Feedback: {{write a feedback for criteria}} [RESULT] {{an integer number between 1 and 5}}\"
4. Please do not generate any other opening, closing, and explanations. Be sure to include [RESULT] in your output.

###The instruction to evaluate:
{instruction}

###Response to evaluate:
{response}

###Reference Answer (Score 5):
{reference_answer}

###Score Rubrics:
[Is the response correct, accurate, and factual based on the reference answer?]
Score 1: The response is completely incorrect, inaccurate, and/or not factual.
Score 2: The response is mostly incorrect, inaccurate, and/or not factual.
Score 3: The response is somewhat correct, accurate, and/or factual.
Score 4: The response is mostly correct, accurate, and factual.
Score 5: The response is completely correct, accurate, and factual.

###Feedback:"""

evaluation_prompt_template = ChatPromptTemplate.from_messages(
    [
        SystemMessage(content="You are a fair evaluator language model."),
        HumanMessagePromptTemplate.from_template(EVALUATION_PROMPT),
    ]
)

evaluation_prompt_template

ChatPromptTemplate(input_variables=['instruction', 'reference_answer', 'response'], input_types={}, partial_variables={}, messages=[SystemMessage(content='You are a fair evaluator language model.', additional_kwargs={}, response_metadata={}), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['instruction', 'reference_answer', 'response'], input_types={}, partial_variables={}, template='###Task Description:\nAn instruction (might include an Input inside it), a response to evaluate, a reference answer that gets a score of 5, and a score rubric representing a evaluation criteria are given.\n1. Write a detailed feedback that assess the quality of the response strictly based on the given score rubric, not evaluating in general.\n2. After writing a feedback, write a score that is an integer between 1 and 5. You should refer to the score rubric.\n3. The output format should look as follows: "Feedback: {{write a feedback for criteria}} [RESULT] {{an integer number between 1 and

In [30]:
def evaluate_answers(
    answer_path: str,
    eval_chat_model,
    evaluator_name: str,
    evaluation_prompt_template: ChatPromptTemplate,
) -> None:
    """Evaluates generated answers. Modifies the given answer file in place for better checkpointing."""
    answers = []
    if os.path.isfile(answer_path):  # load previous generations if they exist
        answers = json.load(open(answer_path, "r"))

    for experiment in tqdm(answers):
        if f"eval_score_{evaluator_name}" in experiment:
            continue

        eval_prompt = evaluation_prompt_template.format_messages(
            instruction=experiment["question"],
            response=experiment["generated_answer"],
            reference_answer=experiment["true_answer"],
        )
        eval_result = eval_chat_model.invoke(eval_prompt)
        feedback, score = [item.strip() for item in eval_result.content.split("[RESULT]")]
        experiment[f"eval_score_{evaluator_name}"] = score
        experiment[f"eval_feedback_{evaluator_name}"] = feedback

        with open(answer_path, "w") as f:
            json.dump(answers, f)

In [32]:
if not os.path.exists("./output"):
    os.mkdir("./output")

In [33]:
for chunk_size in [1500,1700,1800]:  # Add other chunk sizes (in tokens) as needed
        
    settings_name = f"chunk:{chunk_size}_reader-model:gemini-2.0-flash"
    output_file_name = f"./output/rag_{settings_name}.json"

    print(f"Running evaluation for {settings_name}:")

    print("Loading knowledge base embeddings...")

    db = load_embeddings(
        info_articles_final,
        chunk_size=chunk_size,
        embedding_model=embedding_function2,
    )

    print("Running RAG...")
    run_rag_tests(
        eval_dataset=eval_dataset,
        llm=llm_reader,
        database=db,
        output_file=output_file_name,
        verbose=False,
        test_settings=settings_name,
    )

    print("Running evaluation...")
    evaluate_answers(
        output_file_name,
        llm_reader,
        "gemini-2.0-flash",
        evaluation_prompt_template,
    )
    print("Removing database")
    db.delete_collection()

Running evaluation for chunk:1500_reader-model:gemini-2.0-flash:
Loading knowledge base embeddings...
Running RAG...


100%|██████████| 70/70 [02:02<00:00,  1.75s/it]


Running evaluation...


100%|██████████| 63/63 [01:18<00:00,  1.24s/it]


Removing database
Running evaluation for chunk:1700_reader-model:gemini-2.0-flash:
Loading knowledge base embeddings...
Running RAG...


100%|██████████| 70/70 [00:47<00:00,  1.48it/s]


Running evaluation...


100%|██████████| 63/63 [00:41<00:00,  1.51it/s]


Removing database
Running evaluation for chunk:1800_reader-model:gemini-2.0-flash:
Loading knowledge base embeddings...
Running RAG...


100%|██████████| 70/70 [00:41<00:00,  1.67it/s]


Running evaluation...


100%|██████████| 63/63 [00:59<00:00,  1.05it/s]


Removing database


In [34]:
import glob

outputs = []
for file in glob.glob("./output/*.json"):
    output = pd.DataFrame(json.load(open(file, "r")))
    output["settings"] = file
    outputs.append(output)
result = pd.concat(outputs)

In [35]:
result["eval_score_gemini-2.0-flash"] = result["eval_score_gemini-2.0-flash"].apply(lambda x: int(x) if isinstance(x, str) else 0)

In [36]:
result["eval_score_gemini-2.0-flash"] = result["eval_score_gemini-2.0-flash"]/5

In [37]:
average_scores = result.groupby("settings")["eval_score_gemini-2.0-flash"].mean()
average_scores.sort_values()

settings
./output/rag_chunk:1500_reader-model:gemini-2.0-flash.json    0.857143
./output/rag_chunk:1700_reader-model:gemini-2.0-flash.json    0.907937
./output/rag_chunk:1800_reader-model:gemini-2.0-flash.json    0.907937
Name: eval_score_gemini-2.0-flash, dtype: float64