In [None]:
import os
import sys
from ast import literal_eval
from typing import Any, Dict, Union


# Put the parent of the current working directory on the import path.
# NOTE(review): presumably this is what lets `src.grag` (imported below)
# resolve when the notebook runs from a subfolder — confirm against the repo layout.
sys.path.append(os.path.join(os.getcwd(), ".."))


import pandas as pd
from pprint import pprint
from dotenv import load_dotenv
from langchain_ollama import ChatOllama
from langchain_anthropic import ChatAnthropic
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_huggingface import HuggingFaceEmbeddings
from ragas.dataset_schema import EvaluationDataset, EvaluationResult
from src.grag import run_text_generation_workflow, evaluate_text_generation


# Load variables from a .env file into the process environment. The chat
# models built later read ANTHROPIC_API_KEY and GOOGLE_API_KEY from os.environ,
# so this has to run before that cell.
load_dotenv()

## **Preparation**

In [None]:
# Where the test spreadsheet is read from and where per-experiment JSON is written.
DATASET_PATH = os.path.join("data", "testing_dataset.xlsx")
OUTPUT_PATH = os.path.join("results", "llm_text_generation")

os.makedirs(OUTPUT_PATH, exist_ok=True)

df = pd.read_excel(DATASET_PATH)

# Keep only the rows whose `is_valid` cell is truthy, in spreadsheet order.
valid_rows = [record for _, record in df.iterrows() if record["is_valid"]]

# One evaluation sample per valid row. `reference_contexts_1` is parsed with
# literal_eval, so each cell is expected to contain the text of a Python
# literal (presumably a list of context strings — verify against the sheet).
dataset = [
    {
        "user_input": str(record["user_input"]),
        "reference": str(record["reference"]),
        "retrieved_contexts": literal_eval(record["reference_contexts_1"]),
    }
    for record in valid_rows
]

# Side lists aligned index-by-index with `dataset`.
expected_tool_call_names = [
    str(record["reference_tool_call"]) for record in valid_rows
]
generated_cypher_results = [
    str(record["cypher_reference"]) for record in valid_rows
]

evaluation_dataset = EvaluationDataset.from_list(dataset)

len(evaluation_dataset)

## **Evaluation**

In [None]:
# Model identifiers used in this notebook: the two chat models under test
# (Claude, Llama), the Gemini model handed to the evaluation step as its LLM,
# and the embedding model handed to the evaluation step.
CLAUDE_LLM_MODEL_NAME = "claude-3-5-haiku-20241022"
LLAMA_LLM_MODEL_NAME = "llama3.1:8b-instruct-q4_K_M"
GEMINI_LLM_MODEL_NAME = "gemini-2.0-flash"
EMBEDDING_MODEL_NAME = "intfloat/multilingual-e5-large"

# Claude chat model (one of the two test cases). The key is read with
# os.environ[...], so a missing ANTHROPIC_API_KEY raises KeyError right here.
claude_llm = ChatAnthropic(
    model_name=CLAUDE_LLM_MODEL_NAME,
    max_tokens_to_sample=4096,
    temperature=0.0,
    timeout=None,
    api_key=os.environ["ANTHROPIC_API_KEY"],
)

# Llama chat model served through Ollama (the other test case); no API key is passed.
llama_llm = ChatOllama(
    model=LLAMA_LLM_MODEL_NAME,
    num_ctx=32768,
    num_predict=4096,
    temperature=0.0,
)

# Gemini chat model passed to evaluate_text_generation as `llm_model`; it is
# not one of the models under test. A missing GOOGLE_API_KEY raises KeyError here.
llm_evaluator = ChatGoogleGenerativeAI(
    model=GEMINI_LLM_MODEL_NAME,
    temperature=0.0,
    timeout=None,
    api_key=os.environ["GOOGLE_API_KEY"],
)

# Embedding model passed to evaluate_text_generation as `embedding_model`.
embedding_evaluator = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)

In [None]:
def save_experiment_dataset_or_result(
    dataset: Union[EvaluationDataset, EvaluationResult],
    experiment_name: str
) -> None:
    """Dump a dataset or an evaluation result to a JSON file under OUTPUT_PATH.

    The object is converted with ``to_pandas()`` and written with
    ``orient="records"`` to ``<OUTPUT_PATH>/<experiment_name>.json``.
    Calling this twice with the same ``experiment_name`` targets the same path.

    Args:
        dataset: Object exposing ``to_pandas()`` (an ``EvaluationDataset`` or
            an ``EvaluationResult``).
        experiment_name: File stem of the JSON file to write.
    """
    frame = dataset.to_pandas()
    target_file = os.path.join(OUTPUT_PATH, f"{experiment_name}.json")
    frame.to_json(target_file, orient="records")


def run_test_case(test_case: Dict[str, Any]) -> Dict[str, Any]:
    """Generate answers with one LLM, evaluate them, and checkpoint both stages.

    Uses the notebook-level ``evaluation_dataset``, ``expected_tool_call_names``,
    ``generated_cypher_results``, ``llm_evaluator`` and ``embedding_evaluator``.

    Both checkpoints are saved under the same experiment name, so the
    evaluation result written at checkpoint 2 goes to the same file as the
    completed dataset written at checkpoint 1.

    Args:
        test_case: Mapping with an ``"llm_model"`` entry; the model object must
            expose a ``model`` attribute holding its name.

    Returns:
        Dict with ``"experiment_name"`` (model name with ``":"`` replaced by
        ``"-"``), ``"args"`` (``{"llm": <original model name>}``) and
        ``"evaluation_result"``.
    """
    llm = test_case["llm_model"]
    model_name = llm.model
    # ":" appears in Ollama-style tags; swap it out before using the name as a file stem.
    experiment_name = model_name.replace(":", "-")

    # Stage 1: run the text-generation workflow with the LLM under test.
    completed_dataset = run_text_generation_workflow(
        evaluation_dataset,
        experiment_name,
        expected_tool_call_names=expected_tool_call_names,
        generated_cypher_results=generated_cypher_results,
        llm=llm,
        verbose=True,
    )

    # Checkpoint 1: persist the generated responses before the evaluation step runs.
    save_experiment_dataset_or_result(completed_dataset, experiment_name=experiment_name)

    # Stage 2: evaluate the completed dataset with the evaluator LLM and embeddings.
    scores = evaluate_text_generation(
        completed_dataset,
        llm_model=llm_evaluator,
        embedding_model=embedding_evaluator,
        experiment_name=experiment_name,
    )

    # Checkpoint 2: persist the evaluation result (same file name as checkpoint 1).
    save_experiment_dataset_or_result(scores, experiment_name=experiment_name)

    summary = {
        "experiment_name": experiment_name,
        "args": {"llm": model_name},
        "evaluation_result": scores,
    }
    return summary

In [None]:
# Models under test, in run order: Llama (local) first, then Claude (API).
test_cases = [
    {"llm_model": candidate_llm}
    for candidate_llm in (llama_llm, claude_llm)
]

### **Test Case 1**

- Llama 3.1 8B Instruct

In [None]:
# Run generation and evaluation for the first entry of test_cases (the Llama model).
test_result_1 = run_test_case(test_cases[0])

In [None]:
# Pretty-print the dict returned by run_test_case for test case 1.
pprint(test_result_1)

### **Test Case 2**

- Claude 3.5 Haiku

In [None]:
# Run generation and evaluation for the second entry of test_cases (the Claude model).
test_result_2 = run_test_case(test_cases[1])

In [None]:
# Pretty-print the dict returned by run_test_case for test case 2.
pprint(test_result_2)