In [None]:
import os
from typing import Any, Dict
from tqdm import tqdm
import pandas as pd
from pprint import pprint
from langchain_anthropic import ChatAnthropic
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_huggingface import HuggingFaceEmbeddings
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas import evaluate, EvaluationDataset, RunConfig
from ragas.metrics import AnswerAccuracy, ResponseRelevancy, RougeScore

## **Preparation**

In [None]:
# Where experiment artefacts are written and where the test set is read from.
OUTPUT_PATH = os.path.join("results", "end_to_end_graph_rag")
DATASET_PATH = os.path.join("data", "testing_dataset.xlsx")

# Make sure the output directory exists before anything tries to write to it.
os.makedirs(OUTPUT_PATH, exist_ok=True)

df = pd.read_excel(DATASET_PATH)

# Keep only the rows whose `is_valid` cell is truthy, coercing both text
# fields to `str` so every record has a uniform shape.
dataset = [
    {
        "user_input": str(row["user_input"]),
        "reference": str(row["reference"]),
    }
    for _, row in df.iterrows()
    if row["is_valid"]
]

evaluation_dataset = EvaluationDataset.from_list(dataset)

# Displayed as the cell output: number of samples that will be evaluated.
len(evaluation_dataset)

## **Evaluation**

In [None]:
# Identifiers of the models used in this notebook.
CLAUDE_LLM_MODEL_NAME = "claude-3-5-haiku-20241022"
GEMINI_LLM_MODEL_NAME = "gemini-2.0-flash"
EMBEDDING_MODEL_NAME = "intfloat/multilingual-e5-large"

# Both chat models are configured with temperature 0.0 and `timeout=None`.
# API keys are read from the environment; a missing variable raises KeyError.
claude_llm = ChatAnthropic(
    api_key=os.environ["ANTHROPIC_API_KEY"],
    model_name=CLAUDE_LLM_MODEL_NAME,
    temperature=0.0,
    max_tokens_to_sample=4096,
    timeout=None,
)

gemini_llm = ChatGoogleGenerativeAI(
    api_key=os.environ["GOOGLE_API_KEY"],
    model=GEMINI_LLM_MODEL_NAME,
    temperature=0.0,
    timeout=None,
)

# Gemini is the LLM handed to ragas as evaluator; the HuggingFace model
# supplies the embeddings handed to ragas alongside it.
llm_evaluator = gemini_llm
embedding_evaluator = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)

In [None]:
def save_experiment_dataset_or_result(dataset, experiment_name) -> None:
    """Dump a dataset or evaluation result to ``<OUTPUT_PATH>/<experiment_name>.json``.

    Args:
        dataset: Any object exposing ``to_pandas()`` (both the evaluation
            dataset and the evaluation result are passed here).
        experiment_name: File stem of the JSON file to write.
    """
    frame = dataset.to_pandas()
    target_file = os.path.join(OUTPUT_PATH, f"{experiment_name}.json")
    # One JSON object per row.
    frame.to_json(target_file, orient="records")

def run_test_case(test_case: Dict[str, Any]) -> Dict[str, Any]:
    """Answer every question with the test case's LLM (no retrieval) and score the answers.

    The model is invoked on the raw ``user_input`` of each sample in the
    module-level ``evaluation_dataset``; the answer is stored on the sample's
    ``response`` attribute (the shared dataset is updated in place). The
    answered dataset is then evaluated with ROUGE-L F-measure,
    ``ResponseRelevancy`` and ``AnswerAccuracy``.

    Args:
        test_case: Mapping with an ``"llm_model"`` entry. The model must
            expose a ``model`` attribute (used to name the experiment) and an
            ``invoke`` method whose result has a ``content`` attribute.

    Returns:
        A dict with the keys ``"experiment_name"``, ``"args"`` (the model
        identifier under ``"llm"``) and ``"evaluation_result"``.
    """
    llm_model = test_case["llm_model"]

    # "/" and ":" are replaced because the name is used as a file stem.
    experiment_name = (
        f"{llm_model.model}_non_rag"
    ).replace("/", "-").replace(":", "-")

    for data in tqdm(
        iterable=evaluation_dataset,
        desc=f"Running LLM non-RAG: `{experiment_name}`",
        disable=False,
    ):
        response = llm_model.invoke(data.user_input)
        data.response = str(response.content)

    # Checkpoint 1: persist the generated answers before the evaluation runs.
    save_experiment_dataset_or_result(evaluation_dataset, experiment_name)

    run_config = RunConfig(timeout=None)

    evaluation_result = evaluate(
        # Evaluate the samples that now carry the generated `response`.
        # Rebuilding the dataset from the raw `dataset` records would drop the
        # answers, because those records only hold `user_input`/`reference`.
        dataset=evaluation_dataset,
        metrics=[
            RougeScore(rouge_type="rougeL", mode="fmeasure", name="rougeL_fmeasure"),
            ResponseRelevancy(),
            AnswerAccuracy(),
        ],
        llm=LangchainLLMWrapper(llm_evaluator, run_config=run_config),
        embeddings=LangchainEmbeddingsWrapper(embedding_evaluator, run_config=run_config),
        experiment_name=experiment_name,
        run_config=run_config,
    )

    # Checkpoint 2: persist the evaluation result. It is written to the same
    # path as Checkpoint 1, so it replaces that file.
    save_experiment_dataset_or_result(evaluation_result, experiment_name)

    return {
        "experiment_name": experiment_name,
        "args": {"llm": llm_model.model},
        "evaluation_result": evaluation_result,
    }

In [None]:
# One entry per experiment; each entry names the chat model that answers the questions.
test_cases = [{"llm_model": claude_llm}]

### **Test Case 1**

- Claude 3.5 Haiku

In [None]:
# Run the first (and currently only) test case: generation followed by evaluation.
test_result_1 = run_test_case(test_cases[0])

In [None]:
# Show the experiment name, the model identifier and the evaluation result.
pprint(test_result_1)