In [None]:
import os
import sys
from ast import literal_eval
from typing import Any, Dict, List


sys.path.append(os.path.join(os.getcwd(), ".."))


import pandas as pd
from pprint import pprint
from dotenv import load_dotenv
from langchain_neo4j import Neo4jGraph
from langchain_huggingface import HuggingFaceEmbeddings
from ragas.dataset_schema import EvaluationDataset, EvaluationResult
from src.grag import run_vector_cypher_workflow, evaluate_retriever


load_dotenv()

# **Preparation**

In [None]:
# Checkpoint directory and labelled test-set location, both relative to the
# notebook's current working directory.
OUTPUT_PATH = os.path.join("results", "vector_cypher_retriever")
DATASET_PATH = os.path.join("data", "testing_dataset.xlsx")

# The checkpoint directory must exist before any experiment result is saved.
os.makedirs(OUTPUT_PATH, exist_ok=True)

df: pd.DataFrame = pd.read_excel(DATASET_PATH)

# Keep only rows whose `is_valid` cell is truthy. `reference_contexts_1` is
# stored as a Python-literal string, so literal_eval turns it back into an
# object (presumably a list of context strings -- confirm against the sheet).
dataset: List[Dict[str, Any]] = [
    {
        "user_input": str(row["user_input"]),
        "reference_contexts": literal_eval(row["reference_contexts_1"]),
    }
    for _, row in df.iterrows()
    if row["is_valid"]
]

evaluation_dataset = EvaluationDataset.from_list(dataset)

# Displayed as the cell output: how many samples will be evaluated.
len(evaluation_dataset)

# **Evaluation**

In [None]:
# Neo4j connection settings taken from the process environment (load_dotenv()
# ran in the imports cell, so they may originate from a .env file).
# Indexing os.environ directly raises KeyError if a variable is missing.
URI, USERNAME, PASSWORD = (
    os.environ[env_key]
    for env_key in ("NEO4J_HOST", "NEO4J_USERNAME", "NEO4J_PASSWORD")
)


def save_experiment_result(
    evaluation_result: EvaluationResult, experiment_name: str
) -> None:
    """Persist one experiment's evaluation result as a JSON checkpoint.

    The result is converted to a DataFrame with ``to_pandas()`` and written
    to ``<OUTPUT_PATH>/<experiment_name>.json`` as a list of records.

    Args:
        evaluation_result: Result object to serialise.
        experiment_name: File stem of the checkpoint; ``.json`` is appended.
    """
    result_frame = evaluation_result.to_pandas()
    checkpoint_path = os.path.join(OUTPUT_PATH, f"{experiment_name}.json")
    result_frame.to_json(checkpoint_path, orient="records")


def run_test_case(
    test_case: Dict[str, Any], k_min: int = 3, k_max: int = 15
) -> List[Dict[str, Any]]:
    """Run the vector-cypher retrieval sweep for one database/embedder pair.

    For every ``k`` in ``k_min..k_max`` (inclusive) the module-level
    ``evaluation_dataset`` is passed through ``run_vector_cypher_workflow``
    with ``top_k_initial_article=k``, scored with ``evaluate_retriever``,
    and checkpointed with ``save_experiment_result``.

    Args:
        test_case: Mapping with the keys ``"database_name"`` (Neo4j database
            to connect to) and ``"embedding_model_name"`` (model name given
            to ``HuggingFaceEmbeddings``).
        k_min: Smallest ``top_k_initial_article`` value to evaluate.
        k_max: Largest ``top_k_initial_article`` value to evaluate (inclusive).

    Returns:
        One dict per evaluated ``k`` with the keys ``"experiment_name"``,
        ``"args"`` (database, embedding model and k) and
        ``"evaluation_result"``.

    Raises:
        ValueError: If ``k_min`` is greater than ``k_max``.
        KeyError: If ``test_case`` lacks one of the required keys.
    """
    if k_min > k_max:
        raise ValueError(
            f"k_min ({k_min}) must not be greater than k_max ({k_max})"
        )

    # Resolve both keys up front so a malformed test case fails before a
    # database connection is opened.
    database_name = test_case["database_name"]
    embedding_model_name = test_case["embedding_model_name"]

    test_result: List[Dict[str, Any]] = []

    neo4j_graph = Neo4jGraph(
        url=URI,
        username=USERNAME,
        password=PASSWORD,
        database=database_name,
        enhanced_schema=True,
    )

    try:
        embedding_model = HuggingFaceEmbeddings(model_name=embedding_model_name)

        for k in range(k_min, k_max + 1):
            # The name doubles as the checkpoint file stem, so "/" (present in
            # hub-style model names such as "org/model") is replaced across the
            # WHOLE name, not only the trailing fragment.
            experiment_name = (
                f"{database_name}_{embedding_model_name}_init-{k}"
            ).replace("/", "-")

            evaluation_dataset_completed = run_vector_cypher_workflow(
                evaluation_dataset,
                experiment_name,
                neo4j_graph=neo4j_graph,
                embedder_model=embedding_model,
                top_k_initial_article=k,
            )

            evaluation_result = evaluate_retriever(
                evaluation_dataset_completed,
                experiment_name=experiment_name,
            )

            # Checkpoint each k immediately so a later failure does not lose
            # the experiments that already finished.
            save_experiment_result(
                evaluation_result, experiment_name=experiment_name
            )

            test_result.append(
                {
                    "experiment_name": experiment_name,
                    "args": {
                        "database": database_name,
                        "embedding_model": embedding_model_name,
                        "k": k,
                    },
                    "evaluation_result": evaluation_result,
                }
            )
    finally:
        # Release the connection even when an experiment raises; in a notebook
        # the stored traceback would otherwise keep this frame (and the
        # connection) alive. Guarded because older graph classes may not
        # expose close().
        close_graph = getattr(neo4j_graph, "close", None)
        if callable(close_graph):
            close_graph()

    return test_result

In [None]:
# One entry per experiment sweep: the Neo4j database to query paired with the
# embedding model name handed to HuggingFaceEmbeddings.
# NOTE(review): "test" does not follow the db-* naming of the other entries --
# confirm it is the intended database for the Indo-LegalBERT run.
test_cases = [
    {"database_name": database_name, "embedding_model_name": embedding_model_name}
    for database_name, embedding_model_name in (
        ("db-small", "all-MiniLM-L6-v2"),
        ("db-large", "intfloat/multilingual-e5-large"),
        ("test", "archi-ai/Indo-LegalBERT"),
    )
]

## **Test Case 1**

- db-small
- all-MiniLM-L6-v2

In [None]:
# Sweep for test_cases[0]; run_test_case checkpoints each k's result under OUTPUT_PATH.
test_result_1 = run_test_case(test_cases[0])

In [None]:
# Per-k records for test case 1: experiment name, args and evaluation result.
pprint(test_result_1)

## **Test Case 2**

- db-large
- intfloat/multilingual-e5-large

In [None]:
# Sweep for test_cases[1]; run_test_case checkpoints each k's result under OUTPUT_PATH.
test_result_2 = run_test_case(test_cases[1])

In [None]:
# Per-k records for test case 2: experiment name, args and evaluation result.
pprint(test_result_2)

## **Test Case 3**

- test (domain-specific database)
- archi-ai/Indo-LegalBERT

In [None]:
# Sweep for test_cases[2]; run_test_case checkpoints each k's result under OUTPUT_PATH.
test_result_3 = run_test_case(test_cases[2])

In [None]:
# Per-k records for test case 3: experiment name, args and evaluation result.
pprint(test_result_3)