# Import Libraries

In [2]:
import pandas as pd
import sys
import os
from dotenv import load_dotenv
from datasets import Dataset
import json
import time

# RAG Evaluation (RAGAS)
from ragas.metrics import faithfulness, answer_relevancy, context_precision, context_recall
from ragas import evaluate
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper

# LLM
from langchain_ollama.llms import OllamaLLM
from langchain_ollama import OllamaEmbeddings
from langchain_openai import ChatOpenAI
from langchain_openai.embeddings import OpenAIEmbeddings

# Pipeline
# Make the project's local pipeline package importable: its directory is added to
# sys.path (once) before importing from retrieval_generation_pipeline.
# The relative path is resolved against the current working directory at the
# time this cell runs.
pipeline_path = os.path.abspath('../../pipeline')
if pipeline_path not in sys.path:
    sys.path.append(pipeline_path)
from retrieval_generation_pipeline import hypothesis_pipeline, retrieval_pipeline, generation_pipeline

# Load Dataset

In [2]:
def load_evaluation_dataset(file_path):
    """
    Read the evaluation dataset from a CSV file.

    Args:
        file_path (str): Path to the evaluation dataset CSV file.

    Returns:
        pd.DataFrame: The CSV contents, parsed with pandas' default settings.
    """
    return pd.read_csv(file_path)

# Generate Answer Dataset

In [3]:
def generate_answer_dataset(url, api_key, collection_name, embedding_model_name, llm_name, eval_dataset, hyde=False):
    '''
    Run retrieval and answer generation for every row of the evaluation dataset.

    For each row the question is optionally expanded into a hypothesis with
    ``hypothesis_pipeline``, contexts are fetched with ``retrieval_pipeline``
    and an answer is produced with ``generation_pipeline``.

    Args:
        url (str): URL of the Qdrant server
        api_key (str): API key for Qdrant
        collection_name (str): Name of the collection to search for chunks
        embedding_model_name (str): Name of the model to be used for embeddings
        llm_name (str): Name of the LLM passed to the hypothesis and generation pipelines
        eval_dataset (pd.DataFrame): Rows to answer; the columns "question",
            "ground_truth" and "topic" are read from every row
        hyde (bool): When True, a hypothesis is generated for each question,
            passed to the retrieval pipeline and returned under "hypothesis"

    Returns:
        dict: Column-oriented results with the keys "question", "ground_truth",
            "retrieved_contexts" and "answer" (plus "hypothesis" when ``hyde``
            is True). Every value is a list with one entry per dataset row.
    '''
    eval_dict = {
        "question": [],
        "ground_truth": [],
        "retrieved_contexts": [],
        "answer": []
    }

    len_dataset = len(eval_dataset)

    hypothesis = []

    # Count rows by position instead of by index label: the DataFrame index is
    # not guaranteed to be 0..n-1 (e.g. after filtering) or even numeric.
    for position, (_, row) in enumerate(eval_dataset.iterrows(), start=1):
        # Progress indicator, rewritten in place thanks to the carriage return
        print(f"Row {position}/{len_dataset}", flush=True, end="\r")

        query = row["question"]
        topic = row["topic"]
        eval_dict["question"].append(query)
        eval_dict["ground_truth"].append(row["ground_truth"])

        # Without HyDE the retrieval pipeline receives None as the hypothesis
        hyde_response = None
        if hyde:
            hyde_response = hypothesis_pipeline(llm_name, query)
            hypothesis.append(hyde_response)

        retrieved_documents = retrieval_pipeline(url, api_key, collection_name, embedding_model_name, query, topic, hyde_response)
        eval_dict["retrieved_contexts"].append(retrieved_documents)

        llm_answer = generation_pipeline(retrieved_documents, query, llm_name)
        eval_dict["answer"].append(llm_answer)

        # Optional throttle between rows (currently disabled):
        # time.sleep(30)

    if hyde:
        eval_dict["hypothesis"] = hypothesis

    print("Answer generation is finished.")

    return eval_dict

# Running Answer Generation

In [None]:
def main():
    """
    Generate and save answers for every configuration that is still pending.

    The configuration list is read from ``eval_configs_demo.json`` and the
    evaluation dataset from ``evaluation_dataset_demo.csv``. For each entry
    whose ``answer_finished`` flag is False this function:

    1. runs ``generate_answer_dataset`` with the entry's settings,
    2. writes the answers to ``answer_<name>_demo.json``,
    3. sets ``answer_finished`` to True and rewrites the configuration file,
    4. writes the elapsed time to ``answer_duration_<name>_demo.csv``.

    Returns:
        None
    """
    load_dotenv()

    config_path = '../../../config/eval_configs_demo.json'
    with open(config_path) as f:
        configs = json.load(f)

    # Load the dataset used for evaluation
    eval_dataset = load_evaluation_dataset('../../../data/evaluation_dataset_demo.csv')

    # URL and API key used to connect to Qdrant
    url = os.getenv("QDRANT_URL")
    api_key = os.getenv("QDRANT_API_KEY")

    for config in configs:
        if config['answer_finished'] == False:
            config_name = config["name"]
            collection_name = config["collection"]
            embedding_model_name = config["embedding_model"]
            llm_name = config["llm_model"]
            is_hyde = config["hyde"]
            print(f"Generating answer for {config_name}")
            print(f"Collection      : {collection_name}")
            print(f"Embedding Model : {embedding_model_name}")
            print(f"LLM Model       : {llm_name}")
            print(f"HyDE            : {is_hyde}\n------------------------------------------------------------")

            start_time = time.time()

            answer_json = generate_answer_dataset(url, api_key, collection_name, embedding_model_name, llm_name, eval_dataset, hyde=is_hyde)
            file_name = f"answer_{config_name}_demo.json"
            with open(f'../../../data/{file_name}', 'w') as f:
                json.dump(answer_json, f, indent=4)

            # Persist the flag right away so an interrupted run does not
            # regenerate answers for configurations that already finished.
            config['answer_finished'] = True
            with open(config_path, 'w') as f:
                json.dump(configs, f, indent=4)

            duration = time.time() - start_time

            print(f"Answer saved to '{file_name}'. Duration: {duration:.2f} seconds\n")

            # Each per-configuration CSV holds exactly one row: this
            # configuration's own duration. Accumulating rows across the loop
            # would put earlier configurations' timings first in later files.
            duration_df = pd.DataFrame({"config": [config_name], "duration": [duration]})
            duration_df.to_csv(f'../../../data/answer_duration_{config_name}_demo.csv', index=False)

# Run answer generation for every configuration whose 'answer_finished' flag is still False.
main()

Generating answer for combination_1
Collection      : cvd_collection_v1
Embedding Model : nomic-embed-text
LLM Model       : llama3.1
HyDE            : False
------------------------------------------------------------
Row 1/1

KeyboardInterrupt: 

In [None]:
# Scratch parameters (presumably for trying a single query by hand — confirm).

# Qdrant connection settings are read from the environment; None when unset.
url = os.environ.get("QDRANT_URL")
api_key = os.environ.get("QDRANT_API_KEY")

# Collection and model selection; the commented lines are the alternative setup.
collection_name = "cvd_collection_v1"
# collection_name = "cvd_collection_v2"
embedding_model_name = "nomic-embed-text"
# embedding_model_name = "mxbai-embed-large"
llm_name = "llama3.1"

# Sample question and its topic.
query, topic = "Apa saja hal yang dapat menyebabkan jantung berdebar?", "Jantung Berdebar"

# RAG Evaluation

In [59]:
def evaluate_answers(config_name, llm_name, embedding_name, hyde=False):
    """
    Score the saved answers of one configuration with RAGAS, one row at a time.

    Answers are read from ``../../../data/answer_<config_name>_demo.json``.
    Each row is scored for faithfulness, answer relevancy, context precision
    and context recall, and the scored row is appended immediately to
    ``../../../data/evaluation_result_<config_name>_demo.csv``. Rows already
    present in that CSV are skipped and kept, so an interrupted run resumes
    where it stopped without losing earlier results.

    Args:
        config_name (str): Name of the configuration to be evaluated.
        llm_name (str): Name of the LLM model to be used for evaluation.
        embedding_name (str): Name of the embedding model to be used for evaluation.
        hyde (bool): Whether the answers were generated with HyDE; when True the
            "hypothesis" column is carried through to the result file.

    Returns:
        None
    """
    answer_path = f'../../../data/answer_{config_name}_demo.json'
    result_path = f'../../../data/evaluation_result_{config_name}_demo.csv'

    # Load the generated answers from the JSON file
    with open(answer_path) as f:
        answer_dataset = json.load(f)

    load_dotenv()
    # Instantiate the evaluator LLM and embeddings
    llm = ChatOpenAI(model=llm_name, openai_api_key=os.getenv("OPENAI_API_KEY"))
    embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings(model=embedding_name, openai_api_key=os.getenv("OPENAI_API_KEY")))

    # Column order of the result file: inputs first, then the metric scores
    input_columns = ["question", "ground_truth", "retrieved_contexts", "answer"]
    if hyde:
        input_columns.append("hypothesis")
    metric_columns = ["faithfulness", "answer_relevancy", "context_precision", "context_recall"]

    # Resume point: the number of rows already stored in the result file.
    # A missing or completely empty file means nothing has been evaluated yet,
    # in which case the header still has to be written.
    try:
        last_idx = len(pd.read_csv(result_path))
        write_header = False
    except (FileNotFoundError, pd.errors.EmptyDataError):
        last_idx = 0
        write_header = True

    total_rows = len(answer_dataset["answer"])
    for i in range(last_idx, total_rows):
        # Print the progress
        print(f"Evaluating row {i + 1}/{len(answer_dataset['answer'])}", flush=True, end="\r")

        # Build a one-row Hugging Face Dataset for this answer
        answer_row = {column: [answer_dataset[column][i]] for column in input_columns}
        dataset = Dataset.from_dict(answer_row)

        # Score the answer and the retrieved contexts
        score = evaluate(
            dataset,
            metrics=[
                faithfulness,
                answer_relevancy,
                context_precision,
                context_recall
            ],
            llm=llm,
            embeddings=embeddings,
        )

        result_row = {column: answer_dataset[column][i] for column in input_columns}
        for metric in metric_columns:
            result_row[metric] = score[metric][0]

        # Every value is stored as text with real newlines escaped, so each
        # record occupies a single CSV line.
        csv_row = {column: str(value).replace('\n', '\\n') for column, value in result_row.items()}

        # Append after every row so a failure loses no finished work and
        # previously stored rows are never overwritten.
        pd.DataFrame([csv_row]).to_csv(result_path, mode='a', header=write_header, index=False, sep=',')
        write_header = False

In [61]:
def main():
    """
    Evaluate every configuration whose answers have not been scored yet.

    The configuration list is read from ``eval_configs_demo.json``. Each entry
    whose ``eval_finished`` flag equals False is announced on stdout, scored
    through ``evaluate_answers`` with the fixed evaluator models below, and
    then marked as finished by rewriting the whole configuration file.

    Returns:
        None
    """
    load_dotenv()

    # Read the list of RAG configurations
    with open('../../../config/eval_configs_demo.json') as config_file:
        configs = json.load(config_file)

    # Names of the evaluator LLM and embedding model
    llm_eval, embedding_eval = "gpt-4o-mini", "text-embedding-3-small"

    # Lazily pick the entries that still need evaluation, in file order
    pending_configs = (entry for entry in configs if entry['eval_finished'] == False)

    for config in pending_configs:
        config_name = config["name"]
        collection_name = config["collection"]
        embedding_model_name = config["embedding_model"]
        llm_name = config["llm_model"]
        is_hyde = config["hyde"]

        print(f"Evaluating {config_name}")
        print(f"Collection      : {collection_name}")
        print(f"Embedding Model : {embedding_model_name}")
        print(f"LLM Model       : {llm_name}")
        print(f"HyDE            : {is_hyde}\n------------------------------------------------------------")

        evaluate_answers(config_name, llm_eval, embedding_eval, hyde=is_hyde)

        # Record completion on disk before moving on to the next entry
        config['eval_finished'] = True
        with open('../../../config/eval_configs_demo.json', 'w') as config_file:
            json.dump(configs, config_file, indent=4)

        file_name = f"evaluation_result_{config_name}_demo.csv"
        print(f"Evaluation result saved to '{file_name}'.\n")

# Run the evaluation for every configuration whose 'eval_finished' flag is still False.
main()

Evaluating combination_4
Collection      : cvd_collection_v2
Embedding Model : mxbai-embed-large
LLM Model       : deepseek-v2
HyDE            : False
------------------------------------------------------------
Evaluating row 2/2

Evaluating:   0%|          | 0/4 [00:00<?, ?it/s]

Evaluation result saved to 'evaluation_result_combination_4_demo.csv'.



# RAGAS Evaluation

In [31]:
# open answer_combination_1.json
with open('../../../data/answer_combination_1.json') as f:
    eval_dict = json.load(f)

# get the first row of eval_dict
eval_dict = {
    "question": eval_dict["question"][:2],
    "ground_truth": eval_dict["ground_truth"][:2],
    "retrieved_contexts": eval_dict["retrieved_contexts"][:2],
    "answer": eval_dict["answer"][:2],
}

eval_dict

{'question': ['Apa hal-hal yang dapat menyebabkan penyakit hipertensi?',
  'Apa saja gejala hipertensi?'],
 'ground_truth': ['Hipertensi sekunder bisa disebabkan oleh sejumlah kondisi, yaitu:\\n\\nPenyakit ginjal\\nPenyakit kelenjar tiroid\\nTumor kelenjar adrenal\\nKelainan bawaan pada pembuluh darah\\nKecanduan alkohol\\nPenyalahgunaan NAPZA\\nGangguan pernapasan yang terjadi saat tidur (sleep apnea)\\nKonsumsi obat-obatan tertentu, seperti obat antiinflamasi nonsteroid (NSAID), obat batuk pilek, atau pil KB',
  'Gejala yang dapat muncul pada kondisi tersebut adalah:\\n\\nMual dan muntah\\nSakit kepala\\nMimisan\\nSesak napas\\nNyeri dada\\nGangguan penglihatan\\nTelinga berdenging\\nGangguan irama jantung\\nDarah dalam urine'],
 'retrieved_contexts': [['Topik: Hipertensi, Subtopik: Penyebab Hipertensi \n Hipertensi dibagi menjadi hipertensi primer dan sekunder. Hipertensi primer berkembang selama bertahun-tahun dan tidak diketahui penyebabnya secara pasti. Hipertensi primer merupaka

In [32]:
# save eval_dict as answer_combination_4_demo.json
with open('../../../data/answer_combination_4_demo.json', 'w') as f:
    json.dump(eval_dict, f, indent=4)

In [26]:
# Convert the column-oriented dict into a Hugging Face Dataset, the input type passed to evaluate().
# NOTE(review): the saved output below reports num_rows: 1 although the slicing cell keeps
# two rows, and this cell's execution count (26) precedes that cell's (31) — the output is
# stale; re-run the notebook top to bottom to refresh it.
eval_dataset = Dataset.from_dict(eval_dict)

eval_dataset

Dataset({
    features: ['question', 'ground_truth', 'retrieved_contexts', 'answer'],
    num_rows: 1
})

In [None]:
# Load environment variables (including OPENAI_API_KEY) via python-dotenv.
load_dotenv()

# Chat model used as the evaluator LLM.
evaluator_llm = ChatOpenAI(
    model="gpt-4o-mini",
    openai_api_key=os.environ.get("OPENAI_API_KEY"),
)

# Embedding model, wrapped for RAGAS.
evaluator_embeddings = LangchainEmbeddingsWrapper(
    OpenAIEmbeddings(
        model="text-embedding-3-small",
        openai_api_key=os.environ.get("OPENAI_API_KEY"),
    )
)

In [28]:
# Score the sample dataset on the four RAGAS metrics using the evaluator models defined above.
score = evaluate(
    eval_dataset,
    metrics=[
        faithfulness,
        answer_relevancy,
        context_precision,
        context_recall,
    ],
    llm=evaluator_llm,
    embeddings=evaluator_embeddings,
)

Evaluating:   0%|          | 0/4 [00:00<?, ?it/s]

In [29]:
# Display the evaluation result (one aggregate value per metric).
score

{'faithfulness': 1.0000, 'answer_relevancy': 0.5505, 'context_precision': 1.0000, 'context_recall': 1.0000}

In [48]:
# Check which type evaluate() returned.
type(score)

ragas.dataset_schema.EvaluationResult

In [53]:
# Index a metric by name and take element 0 — the same access pattern
# evaluate_answers uses to read the score of its single-row dataset.
score['faithfulness'][0]

1.0

In [5]:
# merged from answer_duration_1_demo.csv to answer_duration_8_demo.csv
with open('../../../config/eval_configs.json') as f:
    configs = json.load(f)

answer_duration = {
    "config": [],
    "collection": [],
    "embedding_model": [],
    "llm_model": [],
    "hyde": [],
    "duration": []
}

configs_fix_time = ['combination_6', 'combination_7', 'combination_8']

for config in configs:
    config_name = config["name"]
    collection_name = config["collection"]
    embedding_model_name = config["embedding_model"]
    llm_name = config["llm_model"]
    is_hyde = config["hyde"]

        
    
    # open 
    df = pd.read_csv(f'../../../data/answer_duration_{config_name}.csv')
    duration = df["duration"].values[0]

    if config_name not in configs_fix_time:
        duration -= 3000

    answer_duration["config"].append(config_name)
    answer_duration["collection"].append(collection_name)
    answer_duration["embedding_model"].append(embedding_model_name)
    answer_duration["llm_model"].append(llm_name)
    answer_duration["hyde"].append(is_hyde)
    answer_duration["duration"].append(duration)

# save to csv
answer_duration_df = pd.DataFrame(answer_duration)
answer_duration_df.to_csv(f'../../../data/answer_duration.csv', index=False, sep=',')

In [None]:
# generator_llm = LangchainLLMWrapper(OllamaLLM(model="llama3.2", temperature=0.4))

# generator_embeddings = LangchainEmbeddingsWrapper(OllamaEmbeddings(model="nomic-embed-text"))