# Personal Financial Agent — Evaluation & Synthetic Data Generation

This notebook demonstrates the full evaluation lifecycle for our Romanian Personal Financial Agent,
directly adapted from **AIE9 Sessions 9-10**.

## Structure
1. **Synthetic Data Generation** — Generate test questions from financial documents using RAGAS
2. **RAG Evaluation — Baseline** — Evaluate with naive top-k retrieval
3. **RAG Evaluation — Improved** — Add Cohere reranking and compare scores
4. **Agent Evaluation** — Test tool routing, topic adherence, MiFID II compliance

In [None]:
# Setup & Imports
import os
import sys
import asyncio
import json
import pandas as pd
from IPython.display import display, HTML, Markdown

# Add app to path
sys.path.insert(0, '/app')

from app.config import settings
from app.services.rag_service import rag_service

# Report which backing services this run is configured against.
# The API key itself is never echoed, not even a prefix: notebook outputs are
# saved and shared, and slicing the key would fail outright if it is unset (None).
api_key_status = 'configured' if settings.openai_api_key else 'MISSING'
print(f'OpenAI API Key: {api_key_status}')
print(f'Qdrant: {settings.qdrant_host}:{settings.qdrant_port}')
print(f'Collection: {settings.qdrant_collection}')

## 1. Synthetic Data Generation (SDG)

Using RAGAS `TestsetGenerator` to create synthetic question-answer pairs from our Romanian
financial documents. This follows the AIE9 Session 9 pattern.

The generator creates three types of questions:
- **Simple** — Single-fact retrieval questions
- **Multi-Context** — Questions requiring information from multiple chunks
- **Reasoning** — Questions requiring inference from retrieved information

In [None]:
# Load documents for SDG
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

# Load all PDFs from documents folder
import glob

# Sort the matches: glob returns paths in arbitrary order, which would make the
# document order (and therefore the preview below and the SDG input) vary
# between runs.
pdf_files = sorted(glob.glob('/app/documents/*.pdf'))

# Accumulate whatever each loader returns across all files.
documents = []
for pdf_path in pdf_files:
    documents.extend(PyMuPDFLoader(pdf_path).load())

if not pdf_files:
    # Make the empty case visible instead of letting a later cell fail obscurely.
    print('WARNING: no PDF files found in /app/documents')

print(f'Loaded {len(documents)} pages from {len(pdf_files)} PDF files')
for doc in documents[:3]:
    print(f'  - {doc.metadata.get("source", "unknown")}: {doc.page_content[:100]}...')

In [None]:
# Generate synthetic test set
from ragas.testset import TestsetGenerator
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper

# Setup LLM and embeddings for SDG
generator_llm = LangchainLLMWrapper(ChatOpenAI(model='gpt-4o', api_key=settings.openai_api_key))
generator_embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings(
    model=settings.embedding_model,
    api_key=settings.openai_api_key
))

# Create test set generator
generator = TestsetGenerator(
    llm=generator_llm,
    embedding_model=generator_embeddings,
)

# Generate synthetic test set
testset = generator.generate_with_langchain_docs(
    documents=documents,
    testset_size=10,
)

# Normalise the column names to the ones the rest of this notebook uses.
# NOTE(review): the generator API used above (embedding_model= / testset_size=)
# belongs to the RAGAS releases whose test set frame exposes `user_input` and
# `reference` rather than `question` and `ground_truth` — confirm against the
# installed version. rename() ignores keys that are absent, so a frame that
# already has the older names passes through unchanged.
test_df = testset.to_pandas().rename(
    columns={'user_input': 'question', 'reference': 'ground_truth'}
)
print(f'Generated {len(test_df)} synthetic test questions')
display(test_df[['question', 'ground_truth']].head(10))

In [None]:
# If SDG fails (e.g., not enough documents), use manually curated test questions
# This is a fallback that ensures the evaluation can always run

MANUAL_TEST_QUESTIONS = [
    {
        'question': 'Ce sunt titlurile de stat TEZAUR?',
        'ground_truth': 'Titlurile TEZAUR sunt instrumente financiare emise de Ministerul Finantelor din Romania, destinate exclusiv persoanelor fizice rezidente. Au maturitati de 1, 3 sau 5 ani, dobanda fixa, si sunt 100% garantate de statul roman. Sunt scutite de impozit pe venit.',
    },
    {
        'question': 'Care sunt diferentele intre TEZAUR si FIDELIS?',
        'ground_truth': 'TEZAUR nu se tranzactioneaza pe bursa si este scutit de impozit. FIDELIS este listat la BVB, poate fi tranzactionat pe piata secundara, si este impozitat cu 10% din 2023.',
    },
    {
        'question': 'Ce avantaje are TEZAUR fata de depozitele bancare?',
        'ground_truth': 'Nu exista risc de pierdere a capitalului investit. Dobanzile sunt mai mari decat la depozitele bancare. Scutire de impozit pe venit. Accesibile de la 1 RON.',
    },
    {
        'question': 'Cum se pot achizitiona titlurile FIDELIS?',
        'ground_truth': 'FIDELIS sunt listate la BVB si pot fi cumparate sau vandute pe piata secundara. Dobanda fixa, platita semestrial sub forma de cupon.',
    },
    {
        'question': 'Ce maturitati au titlurile de stat romanesti?',
        'ground_truth': 'Titlurile TEZAUR si FIDELIS au maturitati de 1 an, 3 ani sau 5 ani. FIDELIS poate fi denominat in LEI sau EURO.',
    },
]

# Minimum number of generated questions required before SDG output is preferred.
MIN_SDG_QUESTIONS = 5

# (question column, ground-truth column) pairs recognised in an SDG frame.
SDG_COLUMN_SCHEMAS = (
    ('question', 'ground_truth'),
    ('user_input', 'reference'),
)


def select_eval_set(sdg_df, manual_questions, min_questions=MIN_SDG_QUESTIONS):
    """Choose between SDG-generated and manually curated evaluation questions.

    Args:
        sdg_df: DataFrame produced by the SDG cell, or None when unavailable.
        manual_questions: list of dicts with 'question' and 'ground_truth' keys.
        min_questions: minimum SDG row count required to prefer the SDG set.

    Returns:
        (questions, ground_truths, source) where source is 'SDG-generated' or
        'manually curated'.
    """
    if sdg_df is not None and len(sdg_df) >= min_questions:
        for question_col, truth_col in SDG_COLUMN_SCHEMAS:
            if question_col in sdg_df.columns and truth_col in sdg_df.columns:
                return (
                    sdg_df[question_col].tolist(),
                    sdg_df[truth_col].tolist(),
                    'SDG-generated',
                )
    # Too few rows, unrecognised columns, or no frame at all: use the curated set.
    return (
        [item['question'] for item in manual_questions],
        [item['ground_truth'] for item in manual_questions],
        'manually curated',
    )


# Use SDG results if available, otherwise fall back to manual.
# Only the "SDG cell never produced test_df" case is caught; any other error
# now surfaces instead of being silently turned into a fallback.
try:
    sdg_df = test_df
except NameError:
    sdg_df = None

eval_questions, eval_ground_truths, eval_source = select_eval_set(sdg_df, MANUAL_TEST_QUESTIONS)
print(f'Using {len(eval_questions)} {eval_source} questions')

for i, q in enumerate(eval_questions, 1):
    print(f'{i}. {q}')

## 2. RAG Evaluation — Baseline (No Reranking)

First, we evaluate the RAG pipeline with naive top-5 similarity search — **no reranking**.
This establishes our baseline scores that we'll improve upon.

In [None]:
# Run baseline RAG evaluation (no reranking)
from datasets import Dataset

async def evaluate_rag(questions, ground_truths, use_reranking=False):
    """Answer each question through the RAG pipeline and gather RAGAS inputs.

    For every question the retriever is queried (optionally with reranking),
    the retrieved page contents are joined into a prompt, and the specialist
    model produces an answer.

    Args:
        questions: iterable of question strings.
        ground_truths: accepted for call-site symmetry; not read by this function.
        use_reranking: forwarded to ``rag_service.query``.

    Returns:
        (answers, contexts): the answer text per question and, in the same
        order, the list of retrieved page contents used for that answer.
    """
    collected_answers, collected_contexts = [], []
    answer_llm = ChatOpenAI(model=settings.specialist_model, api_key=settings.openai_api_key)

    for query_text in questions:
        # Retrieval step
        retrieved = await rag_service.query(query_text, use_reranking=use_reranking)
        passages = [item.page_content for item in retrieved]

        # Generation step: passages separated by blank lines inside the prompt
        joined_passages = '\n\n'.join(passages)
        rag_prompt = (
            'Based on the following context, answer the question.\n\n'
            f'Context:\n{joined_passages}\n\n'
            f'Question: {query_text}\n\n'
            'Answer:'
        )
        reply = await answer_llm.ainvoke(rag_prompt)

        collected_answers.append(reply.content)
        collected_contexts.append(passages)

    return collected_answers, collected_contexts

# Run baseline
print('Running baseline RAG evaluation (no reranking)...')
baseline_answers, baseline_contexts = await evaluate_rag(
    eval_questions, eval_ground_truths, use_reranking=False
)
print(f'Generated {len(baseline_answers)} answers')

In [None]:
from ragas import evaluate as ragas_evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_precision,
    context_recall,
)

# Create RAGAS dataset
baseline_dataset = Dataset.from_dict({
    'question': eval_questions,
    'answer': baseline_answers,
    'contexts': baseline_contexts,
    'ground_truth': eval_ground_truths,
})

# Run RAGAS evaluation
print('Running RAGAS metrics on baseline...')
baseline_metrics = [faithfulness, answer_relevancy, context_precision, context_recall]
baseline_result = ragas_evaluate(
    dataset=baseline_dataset,
    metrics=baseline_metrics,
)

# Aggregate from the per-sample frame instead of calling `.items()` on the
# result: that only works while the result object is dict-like.
# NOTE(review): newer RAGAS result objects are not dicts — confirm against the
# installed version. Series.mean() skips NaN samples by default.
baseline_per_sample = baseline_result.to_pandas()
baseline_scores = {
    m.name: round(float(baseline_per_sample[m.name].mean()), 4)
    for m in baseline_metrics
    if m.name in baseline_per_sample.columns
}

print('\n=== Baseline RAG Scores ===')
for metric, score in baseline_scores.items():
    # A metric that failed on every sample yields NaN; draw an empty bar rather
    # than crashing in int(). Clamp so the bar is always exactly 20 cells wide.
    filled = 0 if pd.isna(score) else max(0, min(20, int(score * 20)))
    bar = '█' * filled + '░' * (20 - filled)
    print(f'  {metric:<25} {bar} {score:.4f}')

## 3. RAG Evaluation — Improved (With Cohere Reranking)

Now we add **Cohere Rerank** (`rerank-multilingual-v3.0`) to the pipeline.
This retrieves top-5 candidates and reranks them down to top-3,
improving precision and relevance.

This is the **iteration story** required for certification — we show measurable improvement.

In [None]:
# Run improved RAG evaluation (with Cohere reranking)
print('Running improved RAG evaluation (with Cohere reranking)...')
reranked_answers, reranked_contexts = await evaluate_rag(
    eval_questions, eval_ground_truths, use_reranking=True
)

# Create RAGAS dataset
reranked_dataset = Dataset.from_dict({
    'question': eval_questions,
    'answer': reranked_answers,
    'contexts': reranked_contexts,
    'ground_truth': eval_ground_truths,
})

# Run RAGAS evaluation
print('Running RAGAS metrics on reranked pipeline...')
reranked_result = ragas_evaluate(
    dataset=reranked_dataset,
    metrics=[faithfulness, answer_relevancy, context_precision, context_recall],
)

reranked_scores = {k: round(v, 4) for k, v in reranked_result.items() if isinstance(v, (int, float))}
print('\n=== Reranked RAG Scores ===')
for metric, score in reranked_scores.items():
    bar = '█' * int(score * 20) + '░' * (20 - int(score * 20))
    print(f'  {metric:<25} {bar} {score:.4f}')

In [None]:
# Baseline vs reranked: plain-text table first, then a styled DataFrame
heavy_rule = '=' * 70
print('\n' + heavy_rule)
print('COMPARISON: Baseline vs Reranked (Cohere rerank-multilingual-v3.0)')
print(heavy_rule)
print(f'{"Metric":<25} {"Baseline":>10} {"Reranked":>10} {"Delta":>10} {"Improved?":>10}')
print('-' * 70)

comparison_data = []
for metric_name, base_val in baseline_scores.items():
    # A metric missing from the reranked run is treated as a score of 0.
    rerank_val = reranked_scores.get(metric_name, 0)
    diff = rerank_val - base_val

    if diff > 0:
        verdict = '✅'
    elif diff == 0:
        verdict = '⚠️'
    else:
        verdict = '❌'

    # Non-negative deltas get an explicit plus sign; negatives carry their own.
    sign = '+' if diff >= 0 else ''
    diff_text = f'{sign}{diff:.4f}'

    print(f'{metric_name:<25} {base_val:>10.4f} {rerank_val:>10.4f} {diff_text:>10} {verdict:>10}')
    comparison_data.append({
        'Metric': metric_name,
        'Baseline': base_val,
        'Reranked': rerank_val,
        'Delta': diff,
        'Improved': verdict,
    })

print('\n')
comparison_df = pd.DataFrame(comparison_data)
number_formats = {'Baseline': '{:.4f}', 'Reranked': '{:.4f}', 'Delta': '{:+.4f}'}
display(comparison_df.style.format(number_formats))

## 4. Agent Evaluation

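We evaluate the full LangGraph Supervisor agent on:
- **Tool Call Accuracy** — Does it route to the right tool?
- **Topic Adherence** — Does the response stay on topic?
- **MiFID II Compliance** — Does it add disclaimers when discussing investments?
- **Language Detection** — Does it respond in the user's language?

Only topic adherence and disclaimer presence are scored automatically below, using simple keyword matching; tool routing and response language are checked by reading the printed response previews.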

In [None]:
# Agent evaluation
from app.services.agent_service import agent_service

# User the evaluation chats are attributed to.
DEMO_USER_ID = '00000000-0000-0000-0000-000000000001'

# Each row: (category, message, expected_topics, should_have_disclaimer).
# expected_topics are substrings looked for in the reply; should_have_disclaimer
# states whether a disclaimer is expected for that message.
_SCENARIO_FIELDS = ('category', 'message', 'expected_topics', 'should_have_disclaimer')
_SCENARIO_ROWS = (
    ('RAG Query', 'Ce este TEZAUR?', ['TEZAUR', 'titluri de stat', 'garantat'], True),
    ('Market Search', 'Care este cursul EUR/RON astazi?', ['EUR', 'RON', 'curs'], False),
    ('Goals Query', 'Care sunt obiectivele mele financiare?', ['obiectiv', 'RON'], False),
    ('Language (EN)', 'What are the differences between TEZAUR and FIDELIS?', ['TEZAUR', 'FIDELIS'], True),
)

AGENT_TEST_SCENARIOS = [dict(zip(_SCENARIO_FIELDS, row)) for row in _SCENARIO_ROWS]

agent_results = []
for i, scenario in enumerate(AGENT_TEST_SCENARIOS, 1):
    print(f'\n--- Scenario {i}: {scenario["category"]} ---')
    print(f'Message: {scenario["message"]}')
    
    response = await agent_service.chat(
        message=scenario['message'],
        user_id=DEMO_USER_ID,
        session_id=f'eval-notebook-{i}',
    )
    
    # Score
    topic_hits = sum(1 for t in scenario['expected_topics'] if t.lower() in response.lower())
    topic_score = topic_hits / len(scenario['expected_topics'])
    has_disclaimer = 'MiFID' in response or 'recomandare' in response.lower()
    disclaimer_ok = has_disclaimer == scenario['should_have_disclaimer']
    overall = topic_score * 0.7 + (1.0 if disclaimer_ok else 0.0) * 0.3
    
    agent_results.append({
        'Category': scenario['category'],
        'Topic Score': f'{topic_score:.0%}',
        'Disclaimer OK': '✅' if disclaimer_ok else '❌',
        'Overall': f'{overall:.2f}',
        'Response Preview': response[:120] + '...',
    })
    print(f'  Score: {overall:.2f} | Topics: {topic_score:.0%} | Disclaimer: {"✅" if disclaimer_ok else "❌"}')
    print(f'  Response: {response[:120]}...')

print('\n\n=== Agent Evaluation Summary ===')
agent_df = pd.DataFrame(agent_results)
display(agent_df)

In [None]:
# Closing recap of the numbers produced by the cells above
banner = '=' * 60
print(banner)
print('EVALUATION COMPLETE')
print(banner)
print(f'\nRAG Baseline Scores:  {baseline_scores}')
print(f'RAG Reranked Scores:  {reranked_scores}')

scenario_count = len(agent_results)
print(f'Agent Scenarios:      {scenario_count} tested')

# A scenario passes when its stored (two-decimal) overall score is at least 0.7.
passed_rows = [row for row in agent_results if float(row["Overall"]) >= 0.7]
print(f'Agent Pass Rate:      {len(passed_rows)}/{scenario_count}')
print('\nAll results are available in the cells above for the Loom video walkthrough.')