In [44]:
import os
import pandas as pd

from langchain.vectorstores import FAISS

from langchain_community.embeddings import HuggingFaceEmbeddings
from smolagents import models, LiteLLMModel, ToolCallingAgent, LogLevel

from src.utils import get_openai_api_key
from src.vector_db_cfg import (
    sports_cfg,
    finance_cfg,
    movie_cfg,
    unified_cfg,
)
from src.retrieval_tool import RetrieverTool
from src.evaluation import evaluate_rag_response

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [19]:
# Folder (relative to the working directory) holding the notebook's input and output files.
data_dir = os.path.join(os.curdir, 'data')

# 1. Connect to LLM

Connect to the LLM through the OpenAI API.

Create an `llm.env` file in the project root folder:
```
OPENAI_API_KEY='<your-openai-api-key>'
```

In [2]:
# Load the OpenAI API key into the environment before any model client is created.
# NOTE(review): presumably reads the key from the `llm.env` file described above —
# confirm against src.utils.get_openai_api_key.
get_openai_api_key()

In [3]:
# Identifier of the model that generates answers for both the agentic and the standard RAG runs.
LLM_MODEL_ID = "o3-mini"

# LiteLLM-backed client for that model.
llm = LiteLLMModel(
    model_id=LLM_MODEL_ID,
)

In [11]:
# Smoke test: send a single greeting and show the reply together with the
# concrete model version reported in the raw response.
test_msg = models.ChatMessage(role="user", content="hello!")
response = llm.generate([test_msg])

print(response.content, f"model_id: {response.raw.model}", sep="\n")

Hello there! How can I help you today?
model_id: o3-mini-2025-01-31


# 2. Create Retrieval Tools

In [5]:
# Set up the embedding model handed to every FAISS index loaded below.
EMBEDDING_MODEL_NAME = "thenlper/gte-small"

# NOTE(review): the device is hard-coded to "cuda", so this cell presumably needs a
# GPU — confirm before running on a CPU-only host.
embedding_model = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL_NAME,
    model_kwargs=dict(device="cuda"),
    encode_kwargs=dict(normalize_embeddings=True),
    multi_process=True,
)

In [6]:
# Data folder, declared again here with the same value as at the top of the notebook.
data_dir = os.path.join(os.curdir, 'data')

In [7]:
# Load one saved FAISS index per knowledge domain from `<data_dir>/faiss/<domain>`.
db_dir = os.path.join(data_dir, "faiss")


def _load_domain_index(domain):
    """Load the FAISS index saved under ``db_dir/<domain>`` using the shared embedding model."""
    # SECURITY: allow_dangerous_deserialization=True lets LangChain unpickle the saved
    # index metadata, and unpickling can execute arbitrary code. Only load index
    # folders that you created yourself or otherwise trust.
    return FAISS.load_local(
        os.path.join(db_dir, domain),
        embeddings=embedding_model,
        allow_dangerous_deserialization=True,
    )


sports_vector_db = _load_domain_index("sports")
finance_vector_db = _load_domain_index("finance")
movie_vector_db = _load_domain_index("movie")

In [8]:
# Wrap each domain index in a retriever tool, pairing it with that domain's config.
sports_tool, finance_tool, movie_tool = (
    RetrieverTool(vectordb=domain_db, cfg=domain_cfg)
    for domain_db, domain_cfg in (
        (sports_vector_db, sports_cfg),
        (finance_vector_db, finance_cfg),
        (movie_vector_db, movie_cfg),
    )
)

# 3. Create RAG AI Agent

In [10]:
# smolagents ToolCallingAgent that can call the three domain retrievers.
# It is capped at 10 steps per run and only logs errors.
agent = ToolCallingAgent(
    tools=[sports_tool, finance_tool, movie_tool],
    model=llm,
    verbosity_level=LogLevel.ERROR,
    max_steps=10,
)

# 4. Run Agentic RAG

In [12]:
# Evaluation questions with their reference answers; preview the first two rows.
question_metadata_path = os.path.join(data_dir, "question_metadata.csv")
question_metadata = pd.read_csv(question_metadata_path)
question_metadata.head(n=2)

Unnamed: 0,interaction_id,domain,question_type,static_or_dynamic,query,answer
0,47859020-9974-4c81-a897-96594beca8fb,movie,aggregation,static,how many family movies were there that came ou...,109
1,80365e4f-795e-4039-8afb-b7a8e8d54285,movie,post-processing,static,what was the average budget for all movies in ...,"$147,375,000"


In [14]:
# Prompt wrapper sent to the agent for every question. It describes the three
# retriever tools, asks the agent to think / call a tool / observe in a loop, and
# tells it to answer factual questions only from retrieved content.
# The single positional `{}` placeholder is filled with the user question via
# `enhanced_query.format(question)` in the agentic loop.
# NOTE(review): the tool names listed below presumably match the names configured
# in sports_cfg / finance_cfg / movie_cfg — confirm against src.vector_db_cfg.
enhanced_query = """You are a helpful AI assistant.
You solve tasks step-by-step, using tools when needed.

Available tools are three retriever tools:
1. sports_retriever: contains knowledge about sports, such as basketball, football, etc.
2. finance_retriever: contains knowledge about earnings reports, market trends, stock performance, and economic indicators
3. movie_retriever: contains knowledge about film synopses, box office data, cast information, and critical reception.

Follow this loop:
1. Think about the problem.
2. If a tool is needed, call it with the right input.
3. Observe the result.
4. Repeat until you can give a final answer.

You can only directly answer daily conversation questions, such as "hello", "could you help me find questions?"; when answering factual question, you have to use retriever tools to get answers.
If you can not find relevant information from given retriever tools, just response that you did not find relevant information.


User question: {}
"""

In [15]:
# Run the agent once per question and keep its final answer keyed by interaction id.
# Results are appended one at a time, so answers gathered before a failure stay
# available in `agentic_rag_output`.
agentic_rag_output = []

id_question_pairs = zip(
    question_metadata['interaction_id'],
    question_metadata['query'],
)
for int_id, question in id_question_pairs:
    # Embed the raw question in the tool-usage instructions before calling the agent.
    response = agent.run(enhanced_query.format(question))
    agentic_rag_output.append({
        'interaction_id': int_id,
        'agentic_response': response,
    })

In [18]:
# Tabulate the collected agentic answers (one row per question) and preview them.
agentic_rag_output_df = pd.DataFrame(data=agentic_rag_output)
agentic_rag_output_df.head(n=2)

Unnamed: 0,interaction_id,agentic_response
0,47859020-9974-4c81-a897-96594beca8fb,I did not find relevant information about the ...
1,80365e4f-795e-4039-8afb-b7a8e8d54285,After reviewing the details from the retrieved...


In [20]:
# Persist the agentic answers so later steps can be rerun without calling the agent again.
agentic_rag_output_path = os.path.join(data_dir, 'agentic_rag_output.csv')
agentic_rag_output_df.to_csv(agentic_rag_output_path, index=False)

# 5. Run Standard RAG

In [23]:
# Build the single retriever used by the standard (non-agentic) RAG baseline.
# NOTE(review): the "unified" index presumably combines all three domains — confirm
# against the index-building code.
# SECURITY: allow_dangerous_deserialization=True permits unpickling; only load
# index folders you trust.
unified_index_path = os.path.join(db_dir, "unified")
unified_vector_db = FAISS.load_local(
    unified_index_path,
    embeddings=embedding_model,
    allow_dangerous_deserialization=True,
)

unified_retriever = RetrieverTool(
    vectordb=unified_vector_db,
    cfg=unified_cfg,
)

In [35]:
# Prompt for the standard RAG baseline. It has two positional `{}` placeholders,
# filled in order by `prompt_template.format(question, context)`: first the user
# question, then the retrieved documents.
# NOTE(review): the name `prompt_template` is rebound further down (judge section)
# to a template with named fields. Re-running the standard-RAG loop after that cell
# would then fail to format with positional arguments — consider a distinct name.
prompt_template = """You are a helpful AI assistant.
You answer question based on supporting documents below, give a comprehensive answer to the question.

If you can not find relevant information from given documents, just response that you did not find relevant information.
Do not give answers based on you own knowledge.


User question: {}

Documents:
{}
"""

In [38]:
# Standard RAG baseline: for every question, retrieve once from the unified index,
# put the retrieved text into the prompt, and ask the LLM for an answer.
standard_rag_output = []
for int_id, question in zip(question_metadata['interaction_id'], question_metadata['query']):
    # Retrieve supporting context for this question.
    context = unified_retriever(question)

    # Single-turn chat message carrying the filled-in prompt.
    message = [{"role": "user", "content": prompt_template.format(question, context)}]

    # Generate the answer and keep only its text content.
    response = llm.generate(message)
    standard_rag_output.append({
        'interaction_id': int_id,
        'standard_response': response.content,
    })

In [39]:
# Tabulate the collected standard-RAG answers (one row per question) and preview them.
standard_rag_output_df = pd.DataFrame(data=standard_rag_output)
standard_rag_output_df.head(n=2)

Unnamed: 0,interaction_id,standard_response
0,47859020-9974-4c81-a897-96594beca8fb,"I looked through the provided documents, but n..."
1,80365e4f-795e-4039-8afb-b7a8e8d54285,"Based on the documents, especially Document 1,..."


In [40]:
# Persist the standard-RAG answers next to the agentic ones.
standard_rag_output_path = os.path.join(data_dir, 'standard_rag_output.csv')
standard_rag_output_df.to_csv(standard_rag_output_path, index=False)

# 6. Set Up a Judge Model to Evaluate Answers from Both RAGs

In [41]:
# Judge model: gpt-4o scores the answers; it is a different model from the one that generated them.
JUDGE_LLM_MODEL_ID = "gpt-4o"
judge_llm = LiteLLMModel(
    model_id=JUDGE_LLM_MODEL_ID,
)

# Smoke test: confirm the judge model responds and show which version served the call.
test_msg = models.ChatMessage(role="user", content="hello!")
response = judge_llm.generate([test_msg])
print(f"llm response: {response.content}")
print(f"model_id: {response.raw.model}")

llm response: Hello! How can I assist you today?
model_id: gpt-4o-2024-08-06


In [42]:
# Instructions and scoring rubric for the judge model. The judge rates each answer
# on faithfulness, relevance and completeness (1-5 each) and must reply with a JSON
# object that also carries a short justification.
# This string contains literal `{` / `}` (the JSON example), so it is concatenated
# as-is in the evaluation loop and must NOT be passed through `str.format`.
base_prompt = """You are an impartial evaluator tasked with scoring a model's answer.

You will receive:
1. **Question** – the original user query.
2. **Reference Answer** – the correct, authoritative answer for comparison.
3. **Model Answer** – the answer generated by the system under evaluation.

Your task:
Compare the Model Answer against the Reference Answer ONLY. Do not use external knowledge.

### Scoring Criteria (1–5)

**1. Faithfulness (Factual Accuracy)**
- 5 = Fully matches the reference answer; no incorrect or contradictory information.
- 4 = Mostly matches; one minor deviation that does not change the meaning.
- 3 = Partially correct; includes both correct and incorrect elements.
- 2 = Mostly incorrect; some overlap with the reference but major errors present.
- 1 = Completely incorrect or contradictory.

**2. Relevance**
- 5 = Entire answer addresses the question directly.
- 4 = Mostly relevant; minor irrelevant detail.
- 3 = Partially relevant; significant portion unrelated to the question.
- 2 = Mostly off-topic with only slight relevance.
- 1 = Entirely irrelevant.

**3. Completeness**
- 5 = Fully answers the question; no significant omissions.
- 4 = Minor omission but core answer is complete.
- 3 = Several relevant points missing.
- 2 = Very incomplete; most key points missing.
- 1 = Does not address the core aspects of the question.

---

### Output Format
Return only a valid JSON object:
{
  "faithfulness": <1-5>,
  "relevance": <1-5>,
  "completeness": <1-5>,
  "justification": "Brief explanation comparing the model answer to the reference answer."
}
"""

# Per-example part of the judge prompt, filled with the named fields `question`,
# `reference_answer` and `model_answer`, then appended to `base_prompt`.
# NOTE(review): this rebinds `prompt_template`, which the standard-RAG section
# defined earlier with positional placeholders — consider a distinct name so that
# section can be rerun after this cell.
# NOTE(review): "Refernce" below is misspelled in the text sent to the judge; it is
# left untouched here because changing the prompt could change the scores.
prompt_template = """
### Question:
{question}

### Refernce Answer:
{reference_answer}

### Model Answer:
{model_answer}
"""

In [43]:
# Merge the reference answers with the answers collected from both the agentic and
# the standard RAG runs, joining on the shared `interaction_id` key.
#
# `validate='one_to_one'` makes pandas raise a MergeError if an interaction_id
# appears more than once on either side. Without it, a duplicated id would silently
# multiply rows and skew the averaged judge scores computed later.
# `how='inner'` is the pandas default, spelled out here: questions missing from
# either output table are dropped from the evaluation.
merged_question_metadata = question_metadata.merge(
    agentic_rag_output_df,
    on=['interaction_id'],
    how='inner',
    validate='one_to_one',
).merge(
    standard_rag_output_df,
    on=['interaction_id'],
    how='inner',
    validate='one_to_one',
)

merged_question_metadata.head(2)

Unnamed: 0,interaction_id,domain,question_type,static_or_dynamic,query,answer,agentic_response,standard_response
0,47859020-9974-4c81-a897-96594beca8fb,movie,aggregation,static,how many family movies were there that came ou...,109,I did not find relevant information about the ...,"I looked through the provided documents, but n..."
1,80365e4f-795e-4039-8afb-b7a8e8d54285,movie,post-processing,static,what was the average budget for all movies in ...,"$147,375,000",After reviewing the details from the retrieved...,"Based on the documents, especially Document 1,..."


In [45]:
# Collect judge scores: each question's agentic and standard answers are scored
# separately against the same reference answer.
judge_model_output = []

# (eval_name passed to the evaluator, column holding that system's answer).
# Agentic is evaluated first, so its keys come first in each record.
evaluated_systems = (
    ("agentic", "agentic_response"),
    ("standard", "standard_response"),
)

for _, row in merged_question_metadata.iterrows():
    record = {'interaction_id': row['interaction_id']}

    for eval_name, answer_column in evaluated_systems:
        # The rubric is concatenated unformatted; only the per-example part is formatted.
        judge_prompt = base_prompt + prompt_template.format(
            question=row['query'],
            reference_answer=row['answer'],
            model_answer=row[answer_column],
        )
        # Merge this system's scores into the per-question record.
        record.update(
            evaluate_rag_response(
                prompt=judge_prompt,
                judge_llm=judge_llm,
                eval_name=eval_name,
            )
        )

    judge_model_output.append(record)

In [46]:
# Tabulate the judge scores (one row per question) and preview them.
judge_model_output_df = pd.DataFrame(data=judge_model_output)
judge_model_output_df.head(n=2)

Unnamed: 0,interaction_id,agentic_faithfulness,agentic_relevance,agentic_completeness,agentic_justification,standard_faithfulness,standard_relevance,standard_completeness,standard_justification
0,47859020-9974-4c81-a897-96594beca8fb,1,2,1,The model answer does not provide the number o...,2,1,1,The model answer does not provide the exact nu...
1,80365e4f-795e-4039-8afb-b7a8e8d54285,3,5,5,"The model answer is mostly faithful, providing...",4,5,5,The model answer provides a detailed breakdown...


In [48]:
# Persist the judge scores alongside the other outputs.
judge_model_output_path = os.path.join(data_dir, 'judge_model_output.csv')
judge_model_output_df.to_csv(judge_model_output_path, index=False)

In [47]:
# Mean judge score per metric for each system (agentic columns first, then standard).
score_columns = [
    f"{system}_{metric}"
    for system in ("agentic", "standard")
    for metric in ("faithfulness", "relevance", "completeness")
]
avg_metrics = judge_model_output_df[score_columns].mean()

avg_metrics

agentic_faithfulness     3.02
agentic_relevance        4.04
agentic_completeness     3.46
standard_faithfulness    2.68
standard_relevance       3.70
standard_completeness    2.92
dtype: float64

In [51]:
# Relative improvement of agentic over standard RAG for each metric, as a
# percentage of the standard score, rounded to two decimals.
print("Agentic RAG improvments:")
for metric in ('faithfulness', 'relevance', 'completeness'):
    standard_score = avg_metrics[f"standard_{metric}"]
    agentic_score = avg_metrics[f"agentic_{metric}"]
    improved = (agentic_score - standard_score) / standard_score
    improved_pct = round(improved*100, 2)
    print(f"- {metric}:", f"{improved_pct}%")

Agentic RAG improvments:
- faithfulness: 12.69%
- relevance: 9.19%
- completeness: 18.49%
