# Evaluation with Ragas

In [1]:
import os
import sys
import uuid

# Put the parent directory on the import path; presumably this is the project
# root that holds the `app` package imported in the cells below — confirm.
sys.path.append(os.path.abspath("../"))

In [2]:
from psycopg_pool import AsyncConnectionPool
from langgraph.checkpoint.postgres.aio import AsyncPostgresSaver

In [3]:
from app.retrieval.vector_store import vector_store_service
from app.core.database import session_manager, pgvector_session_manager
from app.env import DATABASE_URL
from app.retrieval.chain import chain_service

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
async def setup_environment():
    """Initialise the app services and attach a Postgres checkpointer.

    Steps, in order:
      1. initialise the application's session manager;
      2. load the local embedding model, initialise the pgvector store and
         its session manager;
      3. open an async psycopg connection pool, build an
         ``AsyncPostgresSaver`` on it, run its ``setup()`` and register it
         on ``chain_service``.

    Returns:
        The opened ``AsyncConnectionPool``. The caller owns it and is
        responsible for closing it when the notebook is done.

    Raises:
        Any exception raised while opening the pool or setting up the
        checkpointer; the pool is closed before the exception propagates so
        no connections are leaked.
    """
    await session_manager.initialize()
    vector_store_service.initialize_embedding_model('../data/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
    vector_store_service.initialize_pg_vector()
    await pgvector_session_manager.initialize()

    # DATABASE_URL is interpolated after the scheme, so it is expected to
    # come without a "postgresql://" prefix.
    db_uri = f"postgresql://{DATABASE_URL}?sslmode=disable"
    # Same connection settings as the connection-pool example in LangGraph's
    # AsyncPostgresSaver documentation.
    connection_kwargs = {
        "autocommit": True,
        "prepare_threshold": 0,
    }

    # open=False + explicit open(): opening an async pool from its constructor
    # is deprecated in psycopg_pool, and calling __aenter__ by hand is fragile.
    pool = AsyncConnectionPool(
        conninfo=db_uri,
        max_size=20,
        kwargs=connection_kwargs,
        open=False,
    )
    try:
        await pool.open()
        checkpointer = AsyncPostgresSaver(pool)
        await checkpointer.setup()
        chain_service.set_checkpointer(checkpointer)
    except BaseException:
        # Do not leave a half-configured pool holding connections open.
        await pool.close()
        raise

    return pool  # Keep reference to close later

# Run setup once per kernel session. The returned pool is kept in a notebook
# variable because the final cleanup cell needs it to release the connections.
pool = await setup_environment()

Initialize embedding model...
../data/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2
Successfully initialize embedding model


In [None]:
from ragas.metrics import (
    answer_relevancy,
    faithfulness,
    context_recall,
    context_precision,
    answer_correctness,
    answer_similarity
)
from tqdm import tqdm
from langchain.schema import HumanMessage, AIMessage, SystemMessage
from langchain.schema.messages import ToolMessage
from pprint import pprint
from app.retrieval.chain import chain_service

In [None]:
evaluation_data = []

questions = [
    "Siapa kamu?",
    "Berikan jadwal perkuliahan di hari Senin",
    "Siapa saja dosen yang ada di departemen teknologi informasi?",
    "Siapa saja staff tata usaha yang ada di departemen teknologi informasi?"
]

agent = chain_service.create_agent("openai")

expected_answers = {
    "Siapa kamu?": "Saya adalah CATI, asisten virtual yang membantu pengguna dengan menjawab pertanyaan seputar administrasi dan informasi mengenai Departemen Teknologi Informasi di Institut Teknologi Sepuluh Nopember. Saya dikembangkan oleh mahasiswi di Departemen Teknologi Informasi, Midyanisa Yuniar, sebagai bagian dari Tugas Akhir.",
    "Berikan jadwal perkuliahan di hari Senin": """*   *07.00-07.55*: Arsitektur Enterprise C (3 SKS) - Semester 2
    *   *07.00-07.55*: Struktur Data dan Pemograman Berorientasi Objek B (3 SKS) - Semester 2
    *   *07.00-07.55*: Pengembangan Sistem/Aplikasi (Capstone Project) 2022 A - Semester 6
    *   *08.00-08.55*: (2018-2021) - Semester 8
    *   *09.00-09.55*:
    *   *10.00-10.55*: Arsitektur Enterprise A (3 SKS) - Semester 2
    *   *10.00-10.55*: (2022) - Semester 6
    *   *11.00-11.55*:
    *   *12.00-12.55*:
    *   *12.00-12.55*: (2023) - Semester 4
    *   *13.30-14.20*: Sistem Operasi A (4 SKS) - Semester 2
    *   *13.30-14.20*: Struktur Data dan Pemograman Berorientasi Objek C (3 SKS) - Semester 2
    *   *13.30-14.20*: Arsitektur Enterprise B (3 SKS) - Semester 2
    *   *13.30-14.20*: Kriptografi B (3 SKS) - Semester 6
    *   *14.30-15.20*:
    *   *14.30-15.20*: (2024) - Semester 2
    *   *15.30-16.20*:
    """,
    "Siapa saja dosen yang ada di departemen teknologi informasi?": """
    *   Dr.tech.Ir. Raden Venantius Hari Ginardi, M.Sc (Kepala Departemen)
    *   Ir. Muchammad Husni, M.Kom
    *   Dr. Ir. Henning Titi Ciptaningtyas, S.Kom, M.Kom.
    *   Ridho Rahman Hariadi, S.Kom., M.Sc.
    *   Hatma Suryotrisongko, S.Kom., M.Eng., Ph.D.
    *   Annisaa Sri Indrawanti, S. Kom., M. Kom
    *   Dr. Rizka Wakhidatus Sholikah, S. Kom
    *   Irzal Ahmad Sabilla, S. Kom.,M.Kom
    *   Irzal Ahmad Sabilla, S. Kom.,M.Kom
    *   Fuad Dary Rosyadi, S.Kom., M.Kom. 
    *   Hafara Firdausi, S.Kom., M.Kom. 
    """,
    "Siapa saja staff tata usaha yang ada di departemen teknologi informasi?": """
    *  Khairun Nasihin, S.ST. 
    *  Lila Ambarwati, S.Pd. 
    *  Jihan Fadhilah, S.M. 
    *  Anggun Anisafitri, S.Ak. 
    *  Dimas Bahtiyar, A.Md.T.
    *  Rizky Maulana
    """
    # Add more expected answers for each question
}

count = 0

# for q in tqdm(questions):
for q in questions:
    # Execute the agent to get the answer
    print(f"Processing question: {q}")
    
    # Execute the agent to get the answer
    try:
        result = await agent.ainvoke(
            {
                "messages": [
                    SystemMessage(content=f"User ID atau sender pesan adalah: user-test"), 
                    HumanMessage(content=str(q))
                ],
            }, 
            {"configurable": {"thread_id": f"abc-{count}"}}
        )
        
        print("bisa kah sampe sini?")
        
        # answer = result["messages"][-1].content
        # contexts = retrieved_logs.get(q, [])
        print('result')
        pprint(result)
        tool_messages = [
        message for message in result["messages"]
            if isinstance(message, ToolMessage)
        ]

        # Ambil isi ToolMessage terakhir
        if tool_messages:
            raw_content = tool_messages[-1].content
            contexts = [raw_content]  # bisa juga di-parse lebih lanjut jika ingin struktur yang lebih bersih
        else:
            contexts = []
            
        print('contexts')
        print(contexts)
        
        messages = result["messages"]
        ai_messages = [
                message.content
                for message in messages
                if isinstance(message, AIMessage) and message.content.strip() != ""
            ]

        answer = (
                ai_messages[-1]
                if ai_messages
                else "Terjadi kesalahan, tidak ada respon dari AI. Tolong hubungi developer."
            )
        
        # Get the ground truth for this question
        ground_truth = expected_answers.get(q, "No expected answer provided")

        # Add the data to evaluation_data
        evaluation_data.append({
            "question": q,
            "contexts": contexts,
            "response": answer,
            "ground_truth": ground_truth,  # Add ground truth here
        })
        
        count = count + 1
    except Exception as e:
        print(f"Error processing question '{q}': {e}\n")


print('evaluation_data')
print(evaluation_data)

Processing question: Siapa kamu?
bisa kah sampe sini?
result
{'messages': [SystemMessage(content='User ID atau sender pesan adalah: user-test', additional_kwargs={}, response_metadata={}, id='87212502-13e1-4d37-9b63-adebd9e1b6a6'),
              HumanMessage(content='Siapa kamu?', additional_kwargs={}, response_metadata={}, id='90769de6-2f73-479b-82d0-5a0df8f28526'),
              AIMessage(content='Saya adalah CATI, asisten virtual yang dikembangkan oleh mahasiswa di Departemen Teknologi Informasi di Institut Teknologi Sepuluh Nopember. Saya di sini untuk membantu Anda dengan pertanyaan seputar administrasi dan informasi terkait Departemen Teknologi Informasi. Jika ada yang ingin Anda ketahui, silakan bertanya!\n\nAdakah feedback atau saran yang dapat Anda berikan mengenai chatbot berbasis Agentic RAG ini?', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 91, 'prompt_tokens': 456, 'total_tokens': 547, 'completion_tokens_details': {'accepted

In [8]:
from datasets import Dataset as HFDataset
from ragas import evaluate

In [9]:
# The collection loop only prints errors and moves on, so the sample list can
# end up empty; stop here with a clear message instead of an opaque failure
# inside Ragas.
if not evaluation_data:
    raise ValueError("evaluation_data is empty; the agent produced no samples to evaluate")

# One row per question: question, retrieved contexts, response, ground truth.
ragas_dataset = HFDataset.from_list(evaluation_data)

result = evaluate(
    ragas_dataset,
    metrics=[
        context_precision,
        faithfulness,
        answer_relevancy,
        context_recall,
        answer_correctness,
        answer_similarity,
    ],
)

# Last expression of the cell: per-question scores rendered as a DataFrame.
result.to_pandas()

Evaluating: 100%|██████████| 24/24 [00:55<00:00,  2.33s/it]


Unnamed: 0,user_input,retrieved_contexts,response,reference,context_precision,faithfulness,answer_relevancy,context_recall,answer_correctness,semantic_similarity
0,Siapa kamu?,[],"Saya adalah CATI, asisten virtual yang diranca...","Saya adalah CATI, asisten virtual yang membant...",0.0,0.0,0.727426,0.0,0.541052,0.964209
1,Berikan jadwal perkuliahan di hari Senin,"[Source: {'page': 0, 'title': 'PUBLISH Mahasis...",Berikut adalah jadwal perkuliahan untuk hari S...,* *07.00-07.55*: Arsitektur Enterprise C (3 ...,1.0,0.714286,0.788659,0.470588,0.730765,0.923061
2,Siapa saja dosen yang ada di departemen teknol...,"[Source: {'page': 1, 'title': 'Daftar Dosen - ...",Berikut adalah beberapa dosen yang ada di Depa...,\n * Dr.tech.Ir. Raden Venantius Hari Gin...,1.0,0.916667,0.867356,0.636364,0.786911,0.897645
3,Siapa saja staff tata usaha yang ada di depart...,"[Source: {'page': 0, 'author': 'Midyanisa Yuni...",Berikut ini adalah daftar staff tata usaha di ...,"\n * Khairun Nasihin, S.ST. \n * Lila ...",1.0,0.928571,0.900752,1.0,0.684466,0.843126


In [None]:
await session_manager.close()
await pgvector_session_manager.close()
await pool.__aexit__(None, None, None)